LCOV - code coverage report
Current view: top level - kernel - futex.c (source / functions) Hit Total Coverage
Test: Real Lines: 240 892 26.9 %
Date: 2020-10-17 15:46:16 Functions: 1 68 1.5 %
Legend: Neither, QEMU, Real, Both Branches: 0 0 -

           Branch data     Line data    Source code
       1                 :            : // SPDX-License-Identifier: GPL-2.0-or-later
       2                 :            : /*
       3                 :            :  *  Fast Userspace Mutexes (which I call "Futexes!").
       4                 :            :  *  (C) Rusty Russell, IBM 2002
       5                 :            :  *
       6                 :            :  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
       7                 :            :  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
       8                 :            :  *
       9                 :            :  *  Removed page pinning, fix privately mapped COW pages and other cleanups
      10                 :            :  *  (C) Copyright 2003, 2004 Jamie Lokier
      11                 :            :  *
      12                 :            :  *  Robust futex support started by Ingo Molnar
      13                 :            :  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
      14                 :            :  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
      15                 :            :  *
      16                 :            :  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
      17                 :            :  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
      18                 :            :  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
      19                 :            :  *
      20                 :            :  *  PRIVATE futexes by Eric Dumazet
      21                 :            :  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
      22                 :            :  *
      23                 :            :  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
      24                 :            :  *  Copyright (C) IBM Corporation, 2009
      25                 :            :  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
      26                 :            :  *
      27                 :            :  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
      28                 :            :  *  enough at me, Linus for the original (flawed) idea, Matthew
      29                 :            :  *  Kirkwood for proof-of-concept implementation.
      30                 :            :  *
      31                 :            :  *  "The futexes are also cursed."
      32                 :            :  *  "But they come in a choice of three flavours!"
      33                 :            :  */
      34                 :            : #include <linux/compat.h>
      35                 :            : #include <linux/slab.h>
      36                 :            : #include <linux/poll.h>
      37                 :            : #include <linux/fs.h>
      38                 :            : #include <linux/file.h>
      39                 :            : #include <linux/jhash.h>
      40                 :            : #include <linux/init.h>
      41                 :            : #include <linux/futex.h>
      42                 :            : #include <linux/mount.h>
      43                 :            : #include <linux/pagemap.h>
      44                 :            : #include <linux/syscalls.h>
      45                 :            : #include <linux/signal.h>
      46                 :            : #include <linux/export.h>
      47                 :            : #include <linux/magic.h>
      48                 :            : #include <linux/pid.h>
      49                 :            : #include <linux/nsproxy.h>
      50                 :            : #include <linux/ptrace.h>
      51                 :            : #include <linux/sched/rt.h>
      52                 :            : #include <linux/sched/wake_q.h>
      53                 :            : #include <linux/sched/mm.h>
      54                 :            : #include <linux/hugetlb.h>
      55                 :            : #include <linux/freezer.h>
      56                 :            : #include <linux/memblock.h>
      57                 :            : #include <linux/fault-inject.h>
      58                 :            : #include <linux/refcount.h>
      59                 :            : 
      60                 :            : #include <asm/futex.h>
      61                 :            : 
      62                 :            : #include "locking/rtmutex_common.h"
      63                 :            : 
      64                 :            : /*
      65                 :            :  * READ this before attempting to hack on futexes!
      66                 :            :  *
      67                 :            :  * Basic futex operation and ordering guarantees
      68                 :            :  * =============================================
      69                 :            :  *
      70                 :            :  * The waiter reads the futex value in user space and calls
      71                 :            :  * futex_wait(). This function computes the hash bucket and acquires
      72                 :            :  * the hash bucket lock. After that it reads the futex user space value
      73                 :            :  * again and verifies that the data has not changed. If it has not changed
      74                 :            :  * it enqueues itself into the hash bucket, releases the hash bucket lock
      75                 :            :  * and schedules.
      76                 :            :  *
      77                 :            :  * The waker side modifies the user space value of the futex and calls
      78                 :            :  * futex_wake(). This function computes the hash bucket and acquires the
      79                 :            :  * hash bucket lock. Then it looks for waiters on that futex in the hash
      80                 :            :  * bucket and wakes them.
      81                 :            :  *
      82                 :            :  * In futex wake up scenarios where no tasks are blocked on a futex, taking
      83                 :            :  * the hb spinlock can be avoided by simply returning. In order for this
      84                 :            :  * optimization to work, ordering guarantees must exist so that the waiter
      85                 :            :  * being added to the list is acknowledged when the list is concurrently being
      86                 :            :  * checked by the waker, avoiding scenarios like the following:
      87                 :            :  *
      88                 :            :  * CPU 0                               CPU 1
      89                 :            :  * val = *futex;
      90                 :            :  * sys_futex(WAIT, futex, val);
      91                 :            :  *   futex_wait(futex, val);
      92                 :            :  *   uval = *futex;
      93                 :            :  *                                     *futex = newval;
      94                 :            :  *                                     sys_futex(WAKE, futex);
      95                 :            :  *                                       futex_wake(futex);
      96                 :            :  *                                       if (queue_empty())
      97                 :            :  *                                         return;
      98                 :            :  *   if (uval == val)
      99                 :            :  *     lock(hash_bucket(futex));
     100                 :            :  *     queue();
     101                 :            :  *     unlock(hash_bucket(futex));
     102                 :            :  *     schedule();
     103                 :            :  *
     104                 :            :  * This would cause the waiter on CPU 0 to wait forever because it
     105                 :            :  * missed the transition of the user space value from val to newval
     106                 :            :  * and the waker did not find the waiter in the hash bucket queue.
     107                 :            :  *
     108                 :            :  * The correct serialization ensures that a waiter either observes
     109                 :            :  * the changed user space value before blocking or is woken by a
     110                 :            :  * concurrent waker:
     111                 :            :  *
     112                 :            :  * CPU 0                                 CPU 1
     113                 :            :  * val = *futex;
     114                 :            :  * sys_futex(WAIT, futex, val);
     115                 :            :  *   futex_wait(futex, val);
     116                 :            :  *
     117                 :            :  *   waiters++; (a)
     118                 :            :  *   smp_mb(); (A) <-- paired with -.
     119                 :            :  *                                  |
     120                 :            :  *   lock(hash_bucket(futex));      |
     121                 :            :  *                                  |
     122                 :            :  *   uval = *futex;                 |
     123                 :            :  *                                  |        *futex = newval;
     124                 :            :  *                                  |        sys_futex(WAKE, futex);
     125                 :            :  *                                  |          futex_wake(futex);
     126                 :            :  *                                  |
     127                 :            :  *                                  `--------> smp_mb(); (B)
     128                 :            :  *   if (uval == val)
     129                 :            :  *     queue();
     130                 :            :  *     unlock(hash_bucket(futex));
     131                 :            :  *     schedule();                         if (waiters)
     132                 :            :  *                                           lock(hash_bucket(futex));
     133                 :            :  *   else                                    wake_waiters(futex);
     134                 :            :  *     waiters--; (b)                        unlock(hash_bucket(futex));
     135                 :            :  *
     136                 :            :  * Where (A) orders the waiters increment and the futex value read through
     137                 :            :  * atomic operations (see hb_waiters_inc) and where (B) orders the write
     138                 :            :  * to futex and the waiters read -- this is done by the barriers for both
     139                 :            :  * shared and private futexes in get_futex_key_refs().
     140                 :            :  *
     141                 :            :  * This yields the following case (where X:=waiters, Y:=futex):
     142                 :            :  *
     143                 :            :  *      X = Y = 0
     144                 :            :  *
     145                 :            :  *      w[X]=1          w[Y]=1
     146                 :            :  *      MB              MB
     147                 :            :  *      r[Y]=y          r[X]=x
     148                 :            :  *
     149                 :            :  * Which guarantees that x==0 && y==0 is impossible; which translates back into
     150                 :            :  * the guarantee that we cannot both miss the futex variable change and the
     151                 :            :  * enqueue.
     152                 :            :  *
     153                 :            :  * Note that a new waiter is accounted for in (a) even when it is possible that
     154                 :            :  * the wait call can return error, in which case we backtrack from it in (b).
     155                 :            :  * Refer to the comment in queue_lock().
     156                 :            :  *
     157                 :            :  * Similarly, in order to account for waiters being requeued on another
     158                 :            :  * address we always increment the waiters for the destination bucket before
     159                 :            :  * acquiring the lock. We then decrement them again after releasing it -
     160                 :            :  * the code that actually moves the futex(es) between hash buckets (requeue_futex)
     161                 :            :  * will do the additional required waiter count housekeeping. This is done for
     162                 :            :  * double_lock_hb() and double_unlock_hb(), respectively.
     163                 :            :  */
     164                 :            : 
     165                 :            : #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
     166                 :            : #define futex_cmpxchg_enabled 1
     167                 :            : #else
     168                 :            : static int  __read_mostly futex_cmpxchg_enabled;
     169                 :            : #endif
     170                 :            : 
     171                 :            : /*
     172                 :            :  * Futex flags used to encode options to functions and preserve them across
     173                 :            :  * restarts.
     174                 :            :  */
     175                 :            : #ifdef CONFIG_MMU
     176                 :            : # define FLAGS_SHARED           0x01
     177                 :            : #else
     178                 :            : /*
     179                 :            :  * NOMMU does not have per process address space. Let the compiler optimize
     180                 :            :  * code away.
     181                 :            :  */
     182                 :            : # define FLAGS_SHARED           0x00
     183                 :            : #endif
     184                 :            : #define FLAGS_CLOCKRT           0x02
     185                 :            : #define FLAGS_HAS_TIMEOUT       0x04
     186                 :            : 
     187                 :            : /*
     188                 :            :  * Priority Inheritance state:
     189                 :            :  */
     190                 :            : struct futex_pi_state {
     191                 :            :         /*
     192                 :            :          * list of 'owned' pi_state instances - these have to be
     193                 :            :          * cleaned up in do_exit() if the task exits prematurely:
     194                 :            :          */
     195                 :            :         struct list_head list;
     196                 :            : 
     197                 :            :         /*
     198                 :            :          * The PI object:
     199                 :            :          */
     200                 :            :         struct rt_mutex pi_mutex;
     201                 :            : 
     202                 :            :         struct task_struct *owner;
     203                 :            :         refcount_t refcount;
     204                 :            : 
     205                 :            :         union futex_key key;
     206                 :            : } __randomize_layout;
     207                 :            : 
     208                 :            : /**
     209                 :            :  * struct futex_q - The hashed futex queue entry, one per waiting task
     210                 :            :  * @list:               priority-sorted list of tasks waiting on this futex
     211                 :            :  * @task:               the task waiting on the futex
     212                 :            :  * @lock_ptr:           the hash bucket lock
     213                 :            :  * @key:                the key the futex is hashed on
     214                 :            :  * @pi_state:           optional priority inheritance state
     215                 :            :  * @rt_waiter:          rt_waiter storage for use with requeue_pi
     216                 :            :  * @requeue_pi_key:     the requeue_pi target futex key
     217                 :            :  * @bitset:             bitset for the optional bitmasked wakeup
     218                 :            :  *
     219                 :            :  * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
     220                 :            :  * we can wake only the relevant ones (hashed queues may be shared).
     221                 :            :  *
     222                 :            :  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
     223                 :            :  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
     224                 :            :  * The order of wakeup is always to make the first condition true, then
     225                 :            :  * the second.
     226                 :            :  *
     227                 :            :  * PI futexes are typically woken before they are removed from the hash list via
     228                 :            :  * the rt_mutex code. See unqueue_me_pi().
     229                 :            :  */
     230                 :            : struct futex_q {
     231                 :            :         struct plist_node list;
     232                 :            : 
     233                 :            :         struct task_struct *task;
     234                 :            :         spinlock_t *lock_ptr;
     235                 :            :         union futex_key key;
     236                 :            :         struct futex_pi_state *pi_state;
     237                 :            :         struct rt_mutex_waiter *rt_waiter;
     238                 :            :         union futex_key *requeue_pi_key;
     239                 :            :         u32 bitset;
     240                 :            : } __randomize_layout;
     241                 :            : 
     242                 :            : static const struct futex_q futex_q_init = {
     243                 :            :         /* list gets initialized in queue_me() */
     244                 :            :         .key = FUTEX_KEY_INIT,
     245                 :            :         .bitset = FUTEX_BITSET_MATCH_ANY
     246                 :            : };
     247                 :            : 
     248                 :            : /*
     249                 :            :  * Hash buckets are shared by all the futex_keys that hash to the same
     250                 :            :  * location.  Each key may have multiple futex_q structures, one for each task
     251                 :            :  * waiting on a futex.
     252                 :            :  */
     253                 :            : struct futex_hash_bucket {
     254                 :            :         atomic_t waiters;
     255                 :            :         spinlock_t lock;
     256                 :            :         struct plist_head chain;
     257                 :            : } ____cacheline_aligned_in_smp;
     258                 :            : 
     259                 :            : /*
     260                 :            :  * The base of the bucket array and its size are always used together
     261                 :            :  * (after initialization only in hash_futex()), so ensure that they
     262                 :            :  * reside in the same cacheline.
     263                 :            :  */
     264                 :            : static struct {
     265                 :            :         struct futex_hash_bucket *queues;
     266                 :            :         unsigned long            hashsize;
     267                 :            : } __futex_data __read_mostly __aligned(2*sizeof(long));
     268                 :            : #define futex_queues   (__futex_data.queues)
     269                 :            : #define futex_hashsize (__futex_data.hashsize)
     270                 :            : 
     271                 :            : 
     272                 :            : /*
     273                 :            :  * Fault injections for futexes.
     274                 :            :  */
     275                 :            : #ifdef CONFIG_FAIL_FUTEX
     276                 :            : 
     277                 :            : static struct {
     278                 :            :         struct fault_attr attr;
     279                 :            : 
     280                 :            :         bool ignore_private;
     281                 :            : } fail_futex = {
     282                 :            :         .attr = FAULT_ATTR_INITIALIZER,
     283                 :            :         .ignore_private = false,
     284                 :            : };
     285                 :            : 
     286                 :            : static int __init setup_fail_futex(char *str)
     287                 :            : {
     288                 :            :         return setup_fault_attr(&fail_futex.attr, str);
     289                 :            : }
     290                 :            : __setup("fail_futex=", setup_fail_futex);
     291                 :            : 
     292                 :            : static bool should_fail_futex(bool fshared)
     293                 :            : {
     294                 :            :         if (fail_futex.ignore_private && !fshared)
     295                 :            :                 return false;
     296                 :            : 
     297                 :            :         return should_fail(&fail_futex.attr, 1);
     298                 :            : }
     299                 :            : 
     300                 :            : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
     301                 :            : 
     302                 :            : static int __init fail_futex_debugfs(void)
     303                 :            : {
     304                 :            :         umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
     305                 :            :         struct dentry *dir;
     306                 :            : 
     307                 :            :         dir = fault_create_debugfs_attr("fail_futex", NULL,
     308                 :            :                                         &fail_futex.attr);
     309                 :            :         if (IS_ERR(dir))
     310                 :            :                 return PTR_ERR(dir);
     311                 :            : 
     312                 :            :         debugfs_create_bool("ignore-private", mode, dir,
     313                 :            :                             &fail_futex.ignore_private);
     314                 :            :         return 0;
     315                 :            : }
     316                 :            : 
     317                 :            : late_initcall(fail_futex_debugfs);
     318                 :            : 
     319                 :            : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
     320                 :            : 
     321                 :            : #else
     322                 :            : static inline bool should_fail_futex(bool fshared)
     323                 :            : {
     324                 :            :         return false;
     325                 :            : }
     326                 :            : #endif /* CONFIG_FAIL_FUTEX */
     327                 :            : 
     328                 :            : #ifdef CONFIG_COMPAT
     329                 :            : static void compat_exit_robust_list(struct task_struct *curr);
     330                 :            : #else
     331                 :            : static inline void compat_exit_robust_list(struct task_struct *curr) { }
     332                 :            : #endif
     333                 :            : 
     334                 :          3 : static inline void futex_get_mm(union futex_key *key)
     335                 :            : {
     336                 :          3 :         mmgrab(key->private.mm);
     337                 :            :         /*
     338                 :            :          * Ensure futex_get_mm() implies a full barrier such that
     339                 :            :          * get_futex_key() implies a full barrier. This is relied upon
     340                 :            :          * as smp_mb(); (B), see the ordering comment above.
     341                 :            :          */
     342                 :          3 :         smp_mb__after_atomic();
     343                 :          3 : }
     344                 :            : 
     345                 :            : /*
     346                 :            :  * Reflects a new waiter being added to the waitqueue.
     347                 :            :  */
     348                 :          3 : static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
     349                 :            : {
     350                 :            : #ifdef CONFIG_SMP
     351                 :          3 :         atomic_inc(&hb->waiters);
     352                 :            :         /*
     353                 :            :          * Full barrier (A), see the ordering comment above.
     354                 :            :          */
     355                 :          3 :         smp_mb__after_atomic();
     356                 :            : #endif
     357                 :          3 : }
     358                 :            : 
     359                 :            : /*
     360                 :            :  * Reflects a waiter being removed from the waitqueue by wakeup
     361                 :            :  * paths.
     362                 :            :  */
     363                 :            : static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
     364                 :            : {
     365                 :            : #ifdef CONFIG_SMP
     366                 :          3 :         atomic_dec(&hb->waiters);
     367                 :            : #endif
     368                 :            : }
     369                 :            : 
     370                 :            : static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
     371                 :            : {
     372                 :            : #ifdef CONFIG_SMP
     373                 :            :         return atomic_read(&hb->waiters);
     374                 :            : #else
     375                 :            :         return 1;
     376                 :            : #endif
     377                 :            : }
     378                 :            : 
     379                 :            : /**
     380                 :            :  * hash_futex - Return the hash bucket in the global hash
     381                 :            :  * @key:        Pointer to the futex key for which the hash is calculated
     382                 :            :  *
     383                 :            :  * We hash on the keys returned from get_futex_key (see below) and return the
     384                 :            :  * corresponding hash bucket in the global hash.
     385                 :            :  */
     386                 :          3 : static struct futex_hash_bucket *hash_futex(union futex_key *key)
     387                 :            : {
     388                 :          3 :         u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
     389                 :            :                           key->both.offset);
     390                 :            : 
     391                 :          3 :         return &futex_queues[hash & (futex_hashsize - 1)];
     392                 :            : }
     393                 :            : 
     394                 :            : 
     395                 :            : /**
     396                 :            :  * match_futex - Check whether two futex keys are equal
     397                 :            :  * @key1:       Pointer to key1
     398                 :            :  * @key2:       Pointer to key2
     399                 :            :  *
     400                 :            :  * Return 1 if two futex_keys are equal, 0 otherwise.
     401                 :            :  */
     402                 :            : static inline int match_futex(union futex_key *key1, union futex_key *key2)
     403                 :            : {
     404                 :          0 :         return (key1 && key2
     405                 :          3 :                 && key1->both.word == key2->both.word
     406                 :          3 :                 && key1->both.ptr == key2->both.ptr
     407                 :          3 :                 && key1->both.offset == key2->both.offset);
     408                 :            : }
     409                 :            : 
     410                 :            : /*
     411                 :            :  * Take a reference to the resource addressed by a key.
     412                 :            :  * Can be called while holding spinlocks.
     413                 :            :  *
     414                 :            :  */
     415                 :          3 : static void get_futex_key_refs(union futex_key *key)
     416                 :            : {
     417                 :          3 :         if (!key->both.ptr)
     418                 :            :                 return;
     419                 :            : 
     420                 :            :         /*
     421                 :            :          * On MMU less systems futexes are always "private" as there is no per
     422                 :            :  * process address space. We need the smp_mb() nevertheless - yes,
     423                 :            :          * arch/blackfin has MMU less SMP ...
     424                 :            :          */
     425                 :            :         if (!IS_ENABLED(CONFIG_MMU)) {
     426                 :            :                 smp_mb(); /* explicit smp_mb(); (B) */
     427                 :            :                 return;
     428                 :            :         }
     429                 :            : 
     430                 :          3 :         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
     431                 :            :         case FUT_OFF_INODE:
     432                 :          0 :                 smp_mb();               /* explicit smp_mb(); (B) */
     433                 :          0 :                 break;
     434                 :            :         case FUT_OFF_MMSHARED:
     435                 :          3 :                 futex_get_mm(key); /* implies smp_mb(); (B) */
     436                 :          3 :                 break;
     437                 :            :         default:
     438                 :            :                 /*
     439                 :            :                  * Private futexes do not hold reference on an inode or
     440                 :            :                  * mm, therefore the only purpose of calling get_futex_key_refs
     441                 :            :                  * is because we need the barrier for the lockless waiter check.
     442                 :            :                  */
     443                 :          3 :                 smp_mb(); /* explicit smp_mb(); (B) */
     444                 :            :         }
     445                 :            : }
     446                 :            : 
     447                 :            : /*
     448                 :            :  * Drop a reference to the resource addressed by a key.
     449                 :            :  * The hash bucket spinlock must not be held. This is
     450                 :            :  * a no-op for private futexes, see comment in the get
     451                 :            :  * counterpart.
     452                 :            :  */
     453                 :          3 : static void drop_futex_key_refs(union futex_key *key)
     454                 :            : {
     455                 :          3 :         if (!key->both.ptr) {
     456                 :            :                 /* If we're here then we tried to put a key we failed to get */
     457                 :          0 :                 WARN_ON_ONCE(1);
     458                 :            :                 return;
     459                 :            :         }
     460                 :            : 
     461                 :            :         if (!IS_ENABLED(CONFIG_MMU))
     462                 :            :                 return;
     463                 :            : 
     464                 :          3 :         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
     465                 :            :         case FUT_OFF_INODE:
     466                 :            :                 break;
     467                 :            :         case FUT_OFF_MMSHARED:
     468                 :          3 :                 mmdrop(key->private.mm);
     469                 :          3 :                 break;
     470                 :            :         }
     471                 :            : }
     472                 :            : 
     473                 :            : enum futex_access {
     474                 :            :         FUTEX_READ,
     475                 :            :         FUTEX_WRITE
     476                 :            : };
     477                 :            : 
     478                 :            : /**
     479                 :            :  * futex_setup_timer - set up the sleeping hrtimer.
     480                 :            :  * @time:       ptr to the given timeout value
     481                 :            :  * @timeout:    the hrtimer_sleeper structure to be set up
     482                 :            :  * @flags:      futex flags
     483                 :            :  * @range_ns:   optional range in ns
     484                 :            :  *
     485                 :            :  * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
     486                 :            :  *         value given
     487                 :            :  */
     488                 :            : static inline struct hrtimer_sleeper *
     489                 :          3 : futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
     490                 :            :                   int flags, u64 range_ns)
     491                 :            : {
     492                 :          3 :         if (!time)
     493                 :            :                 return NULL;
     494                 :            : 
     495                 :          3 :         hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
     496                 :            :                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
     497                 :            :                                       HRTIMER_MODE_ABS);
     498                 :            :         /*
     499                 :            :          * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
     500                 :            :          * effectively the same as calling hrtimer_set_expires().
     501                 :            :          */
     502                 :          3 :         hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
     503                 :            : 
     504                 :          3 :         return timeout;
     505                 :            : }
     506                 :            : 
     507                 :            : /*
     508                 :            :  * Generate a machine wide unique identifier for this inode.
     509                 :            :  *
     510                 :            :  * This relies on u64 not wrapping in the life-time of the machine; which with
     511                 :            :  * 1ns resolution means almost 585 years.
     512                 :            :  *
     513                 :            :  * This further relies on the fact that a well formed program will not unmap
     514                 :            :  * the file while it has a (shared) futex waiting on it. This mapping will have
     515                 :            :  * a file reference which pins the mount and inode.
     516                 :            :  *
     517                 :            :  * If for some reason an inode gets evicted and read back in again, it will get
     518                 :            :  * a new sequence number and will _NOT_ match, even though it is the exact same
     519                 :            :  * file.
     520                 :            :  *
     521                 :            :  * It is important that match_futex() will never have a false-positive, esp.
     522                 :            :  * for PI futexes that can mess up the state. The above argues that false-negatives
     523                 :            :  * are only possible for malformed programs.
     524                 :            :  */
     525                 :          0 : static u64 get_inode_sequence_number(struct inode *inode)
     526                 :            : {
     527                 :            :         static atomic64_t i_seq;
     528                 :            :         u64 old;
     529                 :            : 
     530                 :            :         /* Does the inode already have a sequence number? */
     531                 :          0 :         old = atomic64_read(&inode->i_sequence);
     532                 :          0 :         if (likely(old))
     533                 :            :                 return old;
     534                 :            : 
     535                 :            :         for (;;) {
     536                 :          0 :                 u64 new = atomic64_add_return(1, &i_seq);
     537                 :          0 :                 if (WARN_ON_ONCE(!new))
     538                 :          0 :                         continue;
     539                 :            : 
     540                 :          0 :                 old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
     541                 :          0 :                 if (old)
     542                 :            :                         return old;
     543                 :          0 :                 return new;
     544                 :          0 :         }
     545                 :            : }
     546                 :            : 
     547                 :            : /**
     548                 :            :  * get_futex_key() - Get parameters which are the keys for a futex
     549                 :            :  * @uaddr:      virtual address of the futex
     550                 :            :  * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
     551                 :            :  * @key:        address where result is stored.
     552                 :            :  * @rw:         mapping needs to be read/write (values: FUTEX_READ,
     553                 :            :  *              FUTEX_WRITE)
     554                 :            :  *
     555                 :            :  * Return: a negative error code or 0
     556                 :            :  *
     557                 :            :  * The key words are stored in @key on success.
     558                 :            :  *
     559                 :            :  * For shared mappings (when @fshared), the key is:
     560                 :            :  *   ( inode->i_sequence, page->index, offset_within_page )
     561                 :            :  * [ also see get_inode_sequence_number() ]
     562                 :            :  *
     563                 :            :  * For private mappings (or when !@fshared), the key is:
     564                 :            :  *   ( current->mm, address, 0 )
     565                 :            :  *
     566                 :            :  * This allows (cross process, where applicable) identification of the futex
     567                 :            :  * without keeping the page pinned for the duration of the FUTEX_WAIT.
     568                 :            :  *
     569                 :            :  * lock_page() might sleep, the caller should not hold a spinlock.
     570                 :            :  */
     571                 :            : static int
     572                 :          3 : get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
     573                 :            : {
     574                 :          3 :         unsigned long address = (unsigned long)uaddr;
     575                 :          3 :         struct mm_struct *mm = current->mm;
     576                 :            :         struct page *page, *tail;
     577                 :            :         struct address_space *mapping;
     578                 :            :         int err, ro = 0;
     579                 :            : 
     580                 :            :         /*
     581                 :            :          * The futex address must be "naturally" aligned.
     582                 :            :          */
     583                 :          3 :         key->both.offset = address % PAGE_SIZE;
     584                 :          3 :         if (unlikely((address % sizeof(u32)) != 0))
     585                 :            :                 return -EINVAL;
     586                 :          3 :         address -= key->both.offset;
     587                 :            : 
     588                 :          3 :         if (unlikely(!access_ok(uaddr, sizeof(u32))))
     589                 :            :                 return -EFAULT;
     590                 :            : 
     591                 :            :         if (unlikely(should_fail_futex(fshared)))
     592                 :            :                 return -EFAULT;
     593                 :            : 
     594                 :            :         /*
     595                 :            :          * PROCESS_PRIVATE futexes are fast.
     596                 :            :          * As the mm cannot disappear under us and the 'key' only needs
     597                 :            :          * virtual address, we dont even have to find the underlying vma.
     598                 :            :          * Note : We do have to check 'uaddr' is a valid user address,
     599                 :            :          *        but access_ok() should be faster than find_vma()
     600                 :            :          */
     601                 :          3 :         if (!fshared) {
     602                 :          3 :                 key->private.mm = mm;
     603                 :          3 :                 key->private.address = address;
     604                 :          3 :                 get_futex_key_refs(key);  /* implies smp_mb(); (B) */
     605                 :          3 :                 return 0;
     606                 :            :         }
     607                 :            : 
     608                 :            : again:
     609                 :            :         /* Ignore any VERIFY_READ mapping (futex common case) */
     610                 :            :         if (unlikely(should_fail_futex(fshared)))
     611                 :            :                 return -EFAULT;
     612                 :            : 
     613                 :          3 :         err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
     614                 :            :         /*
     615                 :            :          * If write access is not required (eg. FUTEX_WAIT), try
     616                 :            :          * and get read-only access.
     617                 :            :          */
     618                 :          3 :         if (err == -EFAULT && rw == FUTEX_READ) {
     619                 :          0 :                 err = get_user_pages_fast(address, 1, 0, &page);
     620                 :            :                 ro = 1;
     621                 :            :         }
     622                 :          3 :         if (err < 0)
     623                 :          0 :                 return err;
     624                 :            :         else
     625                 :            :                 err = 0;
     626                 :            : 
     627                 :            :         /*
     628                 :            :          * The treatment of mapping from this point on is critical. The page
     629                 :            :          * lock protects many things but in this context the page lock
     630                 :            :          * stabilizes mapping, prevents inode freeing in the shared
     631                 :            :          * file-backed region case and guards against movement to swap cache.
     632                 :            :          *
     633                 :            :          * Strictly speaking the page lock is not needed in all cases being
     634                 :            :          * considered here and page lock forces unnecessarily serialization
     635                 :            :          * From this point on, mapping will be re-verified if necessary and
     636                 :            :          * page lock will be acquired only if it is unavoidable
     637                 :            :          *
     638                 :            :          * Mapping checks require the head page for any compound page so the
     639                 :            :          * head page and mapping is looked up now. For anonymous pages, it
     640                 :            :          * does not matter if the page splits in the future as the key is
     641                 :            :          * based on the address. For filesystem-backed pages, the tail is
     642                 :            :          * required as the index of the page determines the key. For
     643                 :            :          * base pages, there is no tail page and tail == page.
     644                 :            :          */
     645                 :          3 :         tail = page;
     646                 :          3 :         page = compound_head(page);
     647                 :          3 :         mapping = READ_ONCE(page->mapping);
     648                 :            : 
     649                 :            :         /*
     650                 :            :          * If page->mapping is NULL, then it cannot be a PageAnon
     651                 :            :          * page; but it might be the ZERO_PAGE or in the gate area or
     652                 :            :          * in a special mapping (all cases which we are happy to fail);
     653                 :            :          * or it may have been a good file page when get_user_pages_fast
     654                 :            :          * found it, but truncated or holepunched or subjected to
     655                 :            :          * invalidate_complete_page2 before we got the page lock (also
     656                 :            :          * cases which we are happy to fail).  And we hold a reference,
     657                 :            :          * so refcount care in invalidate_complete_page's remove_mapping
     658                 :            :          * prevents drop_caches from setting mapping to NULL beneath us.
     659                 :            :          *
     660                 :            :          * The case we do have to guard against is when memory pressure made
     661                 :            :          * shmem_writepage move it from filecache to swapcache beneath us:
     662                 :            :          * an unlikely race, but we do need to retry for page->mapping.
     663                 :            :          */
     664                 :          3 :         if (unlikely(!mapping)) {
     665                 :            :                 int shmem_swizzled;
     666                 :            : 
     667                 :            :                 /*
     668                 :            :                  * Page lock is required to identify which special case above
     669                 :            :                  * applies. If this is really a shmem page then the page lock
     670                 :            :                  * will prevent unexpected transitions.
     671                 :            :                  */
     672                 :          0 :                 lock_page(page);
     673                 :          0 :                 shmem_swizzled = PageSwapCache(page) || page->mapping;
     674                 :          0 :                 unlock_page(page);
     675                 :          0 :                 put_page(page);
     676                 :            : 
     677                 :          0 :                 if (shmem_swizzled)
     678                 :            :                         goto again;
     679                 :            : 
     680                 :            :                 return -EFAULT;
     681                 :            :         }
     682                 :            : 
     683                 :            :         /*
     684                 :            :          * Private mappings are handled in a simple way.
     685                 :            :          *
     686                 :            :          * If the futex key is stored on an anonymous page, then the associated
     687                 :            :          * object is the mm which is implicitly pinned by the calling process.
     688                 :            :          *
     689                 :            :          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
     690                 :            :          * it's a read-only handle, it's expected that futexes attach to
     691                 :            :          * the object not the particular process.
     692                 :            :          */
     693                 :          3 :         if (PageAnon(page)) {
     694                 :            :                 /*
     695                 :            :                  * A RO anonymous page will never change and thus doesn't make
     696                 :            :                  * sense for futex operations.
     697                 :            :                  */
     698                 :          3 :                 if (unlikely(should_fail_futex(fshared)) || ro) {
     699                 :            :                         err = -EFAULT;
     700                 :            :                         goto out;
     701                 :            :                 }
     702                 :            : 
     703                 :          3 :                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
     704                 :          3 :                 key->private.mm = mm;
     705                 :          3 :                 key->private.address = address;
     706                 :            : 
     707                 :            :         } else {
     708                 :            :                 struct inode *inode;
     709                 :            : 
     710                 :            :                 /*
     711                 :            :                  * The associated futex object in this case is the inode and
     712                 :            :                  * the page->mapping must be traversed. Ordinarily this should
     713                 :            :                  * be stabilised under page lock but it's not strictly
     714                 :            :                  * necessary in this case as we just want to pin the inode, not
     715                 :            :                  * update the radix tree or anything like that.
     716                 :            :                  *
     717                 :            :                  * The RCU read lock is taken as the inode is finally freed
     718                 :            :                  * under RCU. If the mapping still matches expectations then the
     719                 :            :                  * mapping->host can be safely accessed as being a valid inode.
     720                 :            :                  */
     721                 :            :                 rcu_read_lock();
     722                 :            : 
     723                 :          0 :                 if (READ_ONCE(page->mapping) != mapping) {
     724                 :            :                         rcu_read_unlock();
     725                 :          0 :                         put_page(page);
     726                 :            : 
     727                 :          0 :                         goto again;
     728                 :            :                 }
     729                 :            : 
     730                 :          0 :                 inode = READ_ONCE(mapping->host);
     731                 :          0 :                 if (!inode) {
     732                 :            :                         rcu_read_unlock();
     733                 :          0 :                         put_page(page);
     734                 :            : 
     735                 :          0 :                         goto again;
     736                 :            :                 }
     737                 :            : 
     738                 :          0 :                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
     739                 :          0 :                 key->shared.i_seq = get_inode_sequence_number(inode);
     740                 :          0 :                 key->shared.pgoff = basepage_index(tail);
     741                 :            :                 rcu_read_unlock();
     742                 :            :         }
     743                 :            : 
     744                 :          3 :         get_futex_key_refs(key); /* implies smp_mb(); (B) */
     745                 :            : 
     746                 :            : out:
     747                 :          3 :         put_page(page);
     748                 :          3 :         return err;
     749                 :            : }
     750                 :            : 
     751                 :            : static inline void put_futex_key(union futex_key *key)
     752                 :            : {
     753                 :          3 :         drop_futex_key_refs(key);
     754                 :            : }
     755                 :            : 
     756                 :            : /**
     757                 :            :  * fault_in_user_writeable() - Fault in user address and verify RW access
     758                 :            :  * @uaddr:      pointer to faulting user space address
     759                 :            :  *
     760                 :            :  * Slow path to fixup the fault we just took in the atomic write
     761                 :            :  * access to @uaddr.
     762                 :            :  *
     763                 :            :  * We have no generic implementation of a non-destructive write to the
     764                 :            :  * user address. We know that we faulted in the atomic pagefault
     765                 :            :  * disabled section so we can as well avoid the #PF overhead by
     766                 :            :  * calling get_user_pages() right away.
     767                 :            :  */
     768                 :          0 : static int fault_in_user_writeable(u32 __user *uaddr)
     769                 :            : {
     770                 :          0 :         struct mm_struct *mm = current->mm;
     771                 :            :         int ret;
     772                 :            : 
     773                 :          0 :         down_read(&mm->mmap_sem);
     774                 :          0 :         ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
     775                 :            :                                FAULT_FLAG_WRITE, NULL);
     776                 :          0 :         up_read(&mm->mmap_sem);
     777                 :            : 
     778                 :          0 :         return ret < 0 ? ret : 0;
     779                 :            : }
     780                 :            : 
     781                 :            : /**
     782                 :            :  * futex_top_waiter() - Return the highest priority waiter on a futex
     783                 :            :  * @hb:         the hash bucket the futex_q's reside in
     784                 :            :  * @key:        the futex key (to distinguish it from other futex futex_q's)
     785                 :            :  *
     786                 :            :  * Must be called with the hb lock held.
     787                 :            :  */
     788                 :          0 : static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
     789                 :            :                                         union futex_key *key)
     790                 :            : {
     791                 :            :         struct futex_q *this;
     792                 :            : 
     793                 :          0 :         plist_for_each_entry(this, &hb->chain, list) {
     794                 :          0 :                 if (match_futex(&this->key, key))
     795                 :          0 :                         return this;
     796                 :            :         }
     797                 :            :         return NULL;
     798                 :            : }
     799                 :            : 
     800                 :          3 : static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
     801                 :            :                                       u32 uval, u32 newval)
     802                 :            : {
     803                 :            :         int ret;
     804                 :            : 
     805                 :            :         pagefault_disable();
     806                 :          3 :         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
     807                 :            :         pagefault_enable();
     808                 :            : 
     809                 :          3 :         return ret;
     810                 :            : }
     811                 :            : 
     812                 :          3 : static int get_futex_value_locked(u32 *dest, u32 __user *from)
     813                 :            : {
     814                 :            :         int ret;
     815                 :            : 
     816                 :            :         pagefault_disable();
     817                 :          3 :         ret = __get_user(*dest, from);
     818                 :            :         pagefault_enable();
     819                 :            : 
     820                 :          3 :         return ret ? -EFAULT : 0;
     821                 :            : }
     822                 :            : 
     823                 :            : 
     824                 :            : /*
     825                 :            :  * PI code:
     826                 :            :  */
     827                 :          0 : static int refill_pi_state_cache(void)
     828                 :            : {
     829                 :            :         struct futex_pi_state *pi_state;
     830                 :            : 
     831                 :          0 :         if (likely(current->pi_state_cache))
     832                 :            :                 return 0;
     833                 :            : 
     834                 :          0 :         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
     835                 :            : 
     836                 :          0 :         if (!pi_state)
     837                 :            :                 return -ENOMEM;
     838                 :            : 
     839                 :          0 :         INIT_LIST_HEAD(&pi_state->list);
     840                 :            :         /* pi_mutex gets initialized later */
     841                 :          0 :         pi_state->owner = NULL;
     842                 :            :         refcount_set(&pi_state->refcount, 1);
     843                 :          0 :         pi_state->key = FUTEX_KEY_INIT;
     844                 :            : 
     845                 :          0 :         current->pi_state_cache = pi_state;
     846                 :            : 
     847                 :          0 :         return 0;
     848                 :            : }
     849                 :            : 
     850                 :          0 : static struct futex_pi_state *alloc_pi_state(void)
     851                 :            : {
     852                 :          0 :         struct futex_pi_state *pi_state = current->pi_state_cache;
     853                 :            : 
     854                 :          0 :         WARN_ON(!pi_state);
     855                 :          0 :         current->pi_state_cache = NULL;
     856                 :            : 
     857                 :          0 :         return pi_state;
     858                 :            : }
     859                 :            : 
     860                 :          0 : static void get_pi_state(struct futex_pi_state *pi_state)
     861                 :            : {
     862                 :          0 :         WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
     863                 :          0 : }
     864                 :            : 
     865                 :            : /*
     866                 :            :  * Drops a reference to the pi_state object and frees or caches it
     867                 :            :  * when the last reference is gone.
     868                 :            :  */
     869                 :          0 : static void put_pi_state(struct futex_pi_state *pi_state)
     870                 :            : {
     871                 :          0 :         if (!pi_state)
     872                 :            :                 return;
     873                 :            : 
     874                 :          0 :         if (!refcount_dec_and_test(&pi_state->refcount))
     875                 :            :                 return;
     876                 :            : 
     877                 :            :         /*
     878                 :            :          * If pi_state->owner is NULL, the owner is most probably dying
     879                 :            :          * and has cleaned up the pi_state already
     880                 :            :          */
     881                 :          0 :         if (pi_state->owner) {
     882                 :            :                 struct task_struct *owner;
     883                 :            : 
     884                 :          0 :                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
     885                 :          0 :                 owner = pi_state->owner;
     886                 :          0 :                 if (owner) {
     887                 :          0 :                         raw_spin_lock(&owner->pi_lock);
     888                 :          0 :                         list_del_init(&pi_state->list);
     889                 :            :                         raw_spin_unlock(&owner->pi_lock);
     890                 :            :                 }
     891                 :          0 :                 rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
     892                 :          0 :                 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
     893                 :            :         }
     894                 :            : 
     895                 :          0 :         if (current->pi_state_cache) {
     896                 :          0 :                 kfree(pi_state);
     897                 :            :         } else {
     898                 :            :                 /*
     899                 :            :                  * pi_state->list is already empty.
     900                 :            :                  * clear pi_state->owner.
     901                 :            :                  * refcount is at 0 - put it back to 1.
     902                 :            :                  */
     903                 :          0 :                 pi_state->owner = NULL;
     904                 :            :                 refcount_set(&pi_state->refcount, 1);
     905                 :          0 :                 current->pi_state_cache = pi_state;
     906                 :            :         }
     907                 :            : }
     908                 :            : 
     909                 :            : #ifdef CONFIG_FUTEX_PI
     910                 :            : 
     911                 :            : /*
     912                 :            :  * This task is holding PI mutexes at exit time => bad.
     913                 :            :  * Kernel cleans up PI-state, but userspace is likely hosed.
     914                 :            :  * (Robust-futex cleanup is separate and might save the day for userspace.)
     915                 :            :  */
     916                 :          0 : static void exit_pi_state_list(struct task_struct *curr)
     917                 :            : {
     918                 :          0 :         struct list_head *next, *head = &curr->pi_state_list;
     919                 :            :         struct futex_pi_state *pi_state;
     920                 :            :         struct futex_hash_bucket *hb;
     921                 :          0 :         union futex_key key = FUTEX_KEY_INIT;
     922                 :            : 
     923                 :          0 :         if (!futex_cmpxchg_enabled)
     924                 :          0 :                 return;
     925                 :            :         /*
     926                 :            :          * We are a ZOMBIE and nobody can enqueue itself on
     927                 :            :          * pi_state_list anymore, but we have to be careful
     928                 :            :          * versus waiters unqueueing themselves:
     929                 :            :          */
     930                 :          0 :         raw_spin_lock_irq(&curr->pi_lock);
     931                 :          0 :         while (!list_empty(head)) {
     932                 :          0 :                 next = head->next;
     933                 :            :                 pi_state = list_entry(next, struct futex_pi_state, list);
     934                 :          0 :                 key = pi_state->key;
     935                 :          0 :                 hb = hash_futex(&key);
     936                 :            : 
     937                 :            :                 /*
     938                 :            :                  * We can race against put_pi_state() removing itself from the
     939                 :            :                  * list (a waiter going away). put_pi_state() will first
     940                 :            :                  * decrement the reference count and then modify the list, so
     941                 :            :                  * it's possible to see the list entry but fail this reference
     942                 :            :                  * acquire.
     943                 :            :                  *
     944                 :            :                  * In that case, drop the locks to let put_pi_state() make
     945                 :            :                  * progress and retry the loop.
     946                 :            :                  */
     947                 :          0 :                 if (!refcount_inc_not_zero(&pi_state->refcount)) {
     948                 :          0 :                         raw_spin_unlock_irq(&curr->pi_lock);
     949                 :          0 :                         cpu_relax();
     950                 :          0 :                         raw_spin_lock_irq(&curr->pi_lock);
     951                 :          0 :                         continue;
     952                 :            :                 }
     953                 :          0 :                 raw_spin_unlock_irq(&curr->pi_lock);
     954                 :            : 
     955                 :            :                 spin_lock(&hb->lock);
     956                 :          0 :                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
     957                 :          0 :                 raw_spin_lock(&curr->pi_lock);
     958                 :            :                 /*
     959                 :            :                  * We dropped the pi-lock, so re-check whether this
     960                 :            :                  * task still owns the PI-state:
     961                 :            :                  */
     962                 :          0 :                 if (head->next != next) {
     963                 :            :                         /* retain curr->pi_lock for the loop invariant */
     964                 :            :                         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
     965                 :            :                         spin_unlock(&hb->lock);
     966                 :          0 :                         put_pi_state(pi_state);
     967                 :          0 :                         continue;
     968                 :            :                 }
     969                 :            : 
     970                 :          0 :                 WARN_ON(pi_state->owner != curr);
     971                 :          0 :                 WARN_ON(list_empty(&pi_state->list));
     972                 :            :                 list_del_init(&pi_state->list);
     973                 :          0 :                 pi_state->owner = NULL;
     974                 :            : 
     975                 :            :                 raw_spin_unlock(&curr->pi_lock);
     976                 :          0 :                 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
     977                 :            :                 spin_unlock(&hb->lock);
     978                 :            : 
     979                 :          0 :                 rt_mutex_futex_unlock(&pi_state->pi_mutex);
     980                 :          0 :                 put_pi_state(pi_state);
     981                 :            : 
     982                 :          0 :                 raw_spin_lock_irq(&curr->pi_lock);
     983                 :            :         }
     984                 :          0 :         raw_spin_unlock_irq(&curr->pi_lock);
     985                 :            : }
     986                 :            : #else
     987                 :            : static inline void exit_pi_state_list(struct task_struct *curr) { }
     988                 :            : #endif
     989                 :            : 
     990                 :            : /*
     991                 :            :  * We need to check the following states:
     992                 :            :  *
     993                 :            :  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
     994                 :            :  *
     995                 :            :  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
     996                 :            :  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
     997                 :            :  *
     998                 :            :  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
     999                 :            :  *
    1000                 :            :  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
    1001                 :            :  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
    1002                 :            :  *
    1003                 :            :  * [6]  Found  | Found    | task      | 0         | 1      | Valid
    1004                 :            :  *
    1005                 :            :  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
    1006                 :            :  *
    1007                 :            :  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
    1008                 :            :  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
    1009                 :            :  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
    1010                 :            :  *
    1011                 :            :  * [1]  Indicates that the kernel can acquire the futex atomically. We
    1012                 :            :  *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
    1013                 :            :  *
    1014                 :            :  * [2]  Valid, if TID does not belong to a kernel thread. If no matching
    1015                 :            :  *      thread is found then it indicates that the owner TID has died.
    1016                 :            :  *
    1017                 :            :  * [3]  Invalid. The waiter is queued on a non PI futex
    1018                 :            :  *
    1019                 :            :  * [4]  Valid state after exit_robust_list(), which sets the user space
    1020                 :            :  *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
    1021                 :            :  *
    1022                 :            :  * [5]  The user space value got manipulated between exit_robust_list()
    1023                 :            :  *      and exit_pi_state_list()
    1024                 :            :  *
    1025                 :            :  * [6]  Valid state after exit_pi_state_list() which sets the new owner in
    1026                 :            :  *      the pi_state but cannot access the user space value.
    1027                 :            :  *
    1028                 :            :  * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
    1029                 :            :  *
    1030                 :            :  * [8]  Owner and user space value match
    1031                 :            :  *
    1032                 :            :  * [9]  There is no transient state which sets the user space TID to 0
    1033                 :            :  *      except exit_robust_list(), but this is indicated by the
    1034                 :            :  *      FUTEX_OWNER_DIED bit. See [4]
    1035                 :            :  *
    1036                 :            :  * [10] There is no transient state which leaves owner and user space
    1037                 :            :  *      TID out of sync.
    1038                 :            :  *
    1039                 :            :  *
    1040                 :            :  * Serialization and lifetime rules:
    1041                 :            :  *
    1042                 :            :  * hb->lock:
    1043                 :            :  *
    1044                 :            :  *      hb -> futex_q, relation
    1045                 :            :  *      futex_q -> pi_state, relation
    1046                 :            :  *
    1047                 :            :  *      (cannot be raw because hb can contain an arbitrary number
    1048                 :            :  *       of futex_q's)
    1049                 :            :  *
    1050                 :            :  * pi_mutex->wait_lock:
    1051                 :            :  *
    1052                 :            :  *      {uval, pi_state}
    1053                 :            :  *
    1054                 :            :  *      (and pi_mutex 'obviously')
    1055                 :            :  *
    1056                 :            :  * p->pi_lock:
    1057                 :            :  *
    1058                 :            :  *      p->pi_state_list -> pi_state->list, relation
    1059                 :            :  *
    1060                 :            :  * pi_state->refcount:
    1061                 :            :  *
    1062                 :            :  *      pi_state lifetime
    1063                 :            :  *
    1064                 :            :  *
    1065                 :            :  * Lock order:
    1066                 :            :  *
    1067                 :            :  *   hb->lock
    1068                 :            :  *     pi_mutex->wait_lock
    1069                 :            :  *       p->pi_lock
    1070                 :            :  *
    1071                 :            :  */
    1072                 :            : 
    1073                 :            : /*
    1074                 :            :  * Validate that the existing waiter has a pi_state and sanity check
    1075                 :            :  * the pi_state against the user space value. If correct, attach to
    1076                 :            :  * it.
    1077                 :            :  */
    1078                 :          0 : static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
    1079                 :            :                               struct futex_pi_state *pi_state,
    1080                 :            :                               struct futex_pi_state **ps)
    1081                 :            : {
    1082                 :          0 :         pid_t pid = uval & FUTEX_TID_MASK;
    1083                 :            :         u32 uval2;
    1084                 :            :         int ret;
    1085                 :            : 
    1086                 :            :         /*
    1087                 :            :          * Userspace might have messed up non-PI and PI futexes [3]
    1088                 :            :          */
    1089                 :          0 :         if (unlikely(!pi_state))
    1090                 :            :                 return -EINVAL;
    1091                 :            : 
    1092                 :            :         /*
    1093                 :            :          * We get here with hb->lock held, and having found a
    1094                 :            :          * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
    1095                 :            :          * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
    1096                 :            :          * which in turn means that futex_lock_pi() still has a reference on
    1097                 :            :          * our pi_state.
    1098                 :            :          *
    1099                 :            :          * The waiter holding a reference on @pi_state also protects against
    1100                 :            :          * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
    1101                 :            :          * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
    1102                 :            :          * free pi_state before we can take a reference ourselves.
    1103                 :            :          */
    1104                 :          0 :         WARN_ON(!refcount_read(&pi_state->refcount));
    1105                 :            : 
    1106                 :            :         /*
    1107                 :            :          * Now that we have a pi_state, we can acquire wait_lock
    1108                 :            :          * and do the state validation.
    1109                 :            :          */
    1110                 :          0 :         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    1111                 :            : 
    1112                 :            :         /*
    1113                 :            :          * Since {uval, pi_state} is serialized by wait_lock, and our current
    1114                 :            :          * uval was read without holding it, it can have changed. Verify it
    1115                 :            :          * still is what we expect it to be, otherwise retry the entire
    1116                 :            :          * operation.
    1117                 :            :          */
    1118                 :          0 :         if (get_futex_value_locked(&uval2, uaddr))
    1119                 :            :                 goto out_efault;
    1120                 :            : 
    1121                 :          0 :         if (uval != uval2)
    1122                 :            :                 goto out_eagain;
    1123                 :            : 
    1124                 :            :         /*
    1125                 :            :          * Handle the owner died case:
    1126                 :            :          */
    1127                 :          0 :         if (uval & FUTEX_OWNER_DIED) {
    1128                 :            :                 /*
    1129                 :            :                  * exit_pi_state_list sets owner to NULL and wakes the
    1130                 :            :                  * topmost waiter. The task which acquires the
    1131                 :            :                  * pi_state->rt_mutex will fixup owner.
    1132                 :            :                  */
    1133                 :          0 :                 if (!pi_state->owner) {
    1134                 :            :                         /*
    1135                 :            :                          * No pi state owner, but the user space TID
    1136                 :            :                          * is not 0. Inconsistent state. [5]
    1137                 :            :                          */
    1138                 :          0 :                         if (pid)
    1139                 :            :                                 goto out_einval;
    1140                 :            :                         /*
    1141                 :            :                          * Take a ref on the state and return success. [4]
    1142                 :            :                          */
    1143                 :            :                         goto out_attach;
    1144                 :            :                 }
    1145                 :            : 
    1146                 :            :                 /*
    1147                 :            :                  * If TID is 0, then either the dying owner has not
    1148                 :            :                  * yet executed exit_pi_state_list() or some waiter
    1149                 :            :                  * acquired the rtmutex in the pi state, but did not
    1150                 :            :                  * yet fixup the TID in user space.
    1151                 :            :                  *
    1152                 :            :                  * Take a ref on the state and return success. [6]
    1153                 :            :                  */
    1154                 :          0 :                 if (!pid)
    1155                 :            :                         goto out_attach;
    1156                 :            :         } else {
    1157                 :            :                 /*
    1158                 :            :                  * If the owner died bit is not set, then the pi_state
    1159                 :            :                  * must have an owner. [7]
    1160                 :            :                  */
    1161                 :          0 :                 if (!pi_state->owner)
    1162                 :            :                         goto out_einval;
    1163                 :            :         }
    1164                 :            : 
    1165                 :            :         /*
    1166                 :            :          * Bail out if user space manipulated the futex value. If pi
    1167                 :            :          * state exists then the owner TID must be the same as the
    1168                 :            :          * user space TID. [9/10]
    1169                 :            :          */
    1170                 :          0 :         if (pid != task_pid_vnr(pi_state->owner))
    1171                 :            :                 goto out_einval;
    1172                 :            : 
    1173                 :            : out_attach:
    1174                 :          0 :         get_pi_state(pi_state);
    1175                 :          0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    1176                 :          0 :         *ps = pi_state;
    1177                 :          0 :         return 0;
    1178                 :            : 
    1179                 :            : out_einval:
    1180                 :            :         ret = -EINVAL;
    1181                 :            :         goto out_error;
    1182                 :            : 
    1183                 :            : out_eagain:
    1184                 :            :         ret = -EAGAIN;
    1185                 :            :         goto out_error;
    1186                 :            : 
    1187                 :            : out_efault:
    1188                 :            :         ret = -EFAULT;
    1189                 :            :         goto out_error;
    1190                 :            : 
    1191                 :            : out_error:
    1192                 :          0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    1193                 :          0 :         return ret;
    1194                 :            : }
    1195                 :            : 
    1196                 :            : /**
    1197                 :            :  * wait_for_owner_exiting - Block until the owner has exited
    1198                 :            :  * @exiting:    Pointer to the exiting task
    1199                 :            :  *
    1200                 :            :  * Caller must hold a refcount on @exiting.
    1201                 :            :  */
    1202                 :          0 : static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
    1203                 :            : {
    1204                 :          0 :         if (ret != -EBUSY) {
    1205                 :          0 :                 WARN_ON_ONCE(exiting);
    1206                 :            :                 return;
    1207                 :            :         }
    1208                 :            : 
    1209                 :          0 :         if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
    1210                 :            :                 return;
    1211                 :            : 
    1212                 :          0 :         mutex_lock(&exiting->futex_exit_mutex);
    1213                 :            :         /*
    1214                 :            :          * No point in doing state checking here. If the waiter got here
    1215                 :            :          * while the task was in exec()->exec_futex_release() then it can
    1216                 :            :          * have any FUTEX_STATE_* value when the waiter has acquired the
    1217                 :            :          * mutex. OK, if running, EXITING or DEAD if it reached exit()
    1218                 :            :          * already. Highly unlikely and not a problem. Just one more round
    1219                 :            :          * through the futex maze.
    1220                 :            :          */
    1221                 :          0 :         mutex_unlock(&exiting->futex_exit_mutex);
    1222                 :            : 
    1223                 :          0 :         put_task_struct(exiting);
    1224                 :            : }
    1225                 :            : 
    1226                 :          0 : static int handle_exit_race(u32 __user *uaddr, u32 uval,
    1227                 :            :                             struct task_struct *tsk)
    1228                 :            : {
    1229                 :            :         u32 uval2;
    1230                 :            : 
    1231                 :            :         /*
    1232                 :            :          * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
    1233                 :            :          * caller that the alleged owner is busy.
    1234                 :            :          */
    1235                 :          0 :         if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
    1236                 :            :                 return -EBUSY;
    1237                 :            : 
    1238                 :            :         /*
    1239                 :            :          * Reread the user space value to handle the following situation:
    1240                 :            :          *
    1241                 :            :          * CPU0                         CPU1
    1242                 :            :          *
    1243                 :            :          * sys_exit()                   sys_futex()
    1244                 :            :          *  do_exit()                    futex_lock_pi()
    1245                 :            :          *                                futex_lock_pi_atomic()
    1246                 :            :          *   exit_signals(tsk)              No waiters:
    1247                 :            :          *    tsk->flags |= PF_EXITING;          *uaddr == 0x00000PID
    1248                 :            :          *  mm_release(tsk)                 Set waiter bit
    1249                 :            :          *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
    1250                 :            :          *      Set owner died              attach_to_pi_owner() {
    1251                 :            :          *    *uaddr = 0xC0000000;           tsk = get_task(PID);
    1252                 :            :          *   }                               if (!(tsk->flags & PF_EXITING)) {
    1253                 :            :          *  ...                                attach();
    1254                 :            :          *  tsk->futex_state =               } else {
    1255                 :            :          *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
    1256                 :            :          *                                        FUTEX_STATE_DEAD)
    1257                 :            :          *                                       return -EAGAIN;
    1258                 :            :          *                                     return -ESRCH; <--- FAIL
    1259                 :            :          *                                   }
    1260                 :            :          *
    1261                 :            :          * Returning ESRCH unconditionally is wrong here because the
    1262                 :            :          * user space value has been changed by the exiting task.
    1263                 :            :          *
    1264                 :            :          * The same logic applies to the case where the exiting task is
    1265                 :            :          * already gone.
    1266                 :            :          */
    1267                 :          0 :         if (get_futex_value_locked(&uval2, uaddr))
    1268                 :            :                 return -EFAULT;
    1269                 :            : 
    1270                 :            :         /* If the user space value has changed, try again. */
    1271                 :          0 :         if (uval2 != uval)
    1272                 :            :                 return -EAGAIN;
    1273                 :            : 
    1274                 :            :         /*
    1275                 :            :          * The exiting task did not have a robust list, the robust list was
    1276                 :            :          * corrupted or the user space value in *uaddr is simply bogus.
    1277                 :            :          * Give up and tell user space.
    1278                 :            :          */
    1279                 :          0 :         return -ESRCH;
    1280                 :            : }
    1281                 :            : 
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 *
 * @uaddr:      user space address of the futex; only passed on to
 *              handle_exit_race()
 * @uval:       user space value; the owner TID is taken from the
 *              FUTEX_TID_MASK bits
 * @key:        futex key, copied into the newly created pi_state
 * @ps:         on success, receives the newly created pi_state
 * @exiting:    receives the owner task when handle_exit_race() returned
 *              -EBUSY; the task reference is kept in that case
 *
 * Return:
 *  -  0        - a pi_state owned by the looked up task was created and
 *                stored in @ps
 *  - -EAGAIN   - the TID in @uval is 0
 *  - -EPERM    - the TID belongs to a kernel thread
 *  - otherwise - the result of handle_exit_race(), when the task was not
 *                found or its futex_state is not FUTEX_STATE_OK
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                              struct futex_pi_state **ps,
                              struct task_struct **exiting)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        struct futex_pi_state *pi_state;
        struct task_struct *p;

        /*
         * We are the first waiter - try to look up the real owner and attach
         * the new pi_state to it, but bail out when TID = 0 [1]
         *
         * The !pid check is paranoid. None of the call sites should end up
         * with pid == 0, but better safe than sorry. Let the caller retry
         */
        if (!pid)
                return -EAGAIN;
        /* On success this holds a reference on @p which is dropped below. */
        p = find_get_task_by_vpid(pid);
        if (!p)
                return handle_exit_race(uaddr, uval, NULL);

        /* Kernel threads are refused as PI futex owners. */
        if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
                return -EPERM;
        }

        /*
         * We need to look at the task state to figure out, whether the
         * task is exiting. To protect against the change of the task state
         * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
        if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
                 * The task is on the way out. When the futex state is
                 * FUTEX_STATE_DEAD, we know that the task has finished
                 * the cleanup:
                 */
                int ret = handle_exit_race(uaddr, uval, p);

                raw_spin_unlock_irq(&p->pi_lock);
                /*
                 * If the owner task is between FUTEX_STATE_EXITING and
                 * FUTEX_STATE_DEAD then store the task pointer and keep
                 * the reference on the task struct. The calling code will
                 * drop all locks, wait for the task to reach
                 * FUTEX_STATE_DEAD and then drop the refcount. This is
                 * required to prevent a live lock when the current task
                 * preempted the exiting task between the two states.
                 */
                if (ret == -EBUSY)
                        *exiting = p;
                else
                        put_task_struct(p);
                return ret;
        }

        /*
         * No existing pi state. First waiter. [2]
         *
         * This creates pi_state, we have hb->lock held, this means nothing can
         * observe this state, wait_lock is irrelevant.
         *
         * NOTE(review): the result is used without a NULL check; presumably
         * alloc_pi_state() cannot fail at this point - confirm against its
         * definition.
         */
        pi_state = alloc_pi_state();

        /*
         * Initialize the pi_mutex in locked state and make @p
         * the owner of it:
         */
        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

        /* Store the key for possible exit cleanups: */
        pi_state->key = *key;

        /* Link the pi_state into the owner's list; still under p->pi_lock. */
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        /*
         * Assignment without holding pi_state->pi_mutex.wait_lock is safe
         * because there is no concurrency as the object is not published yet.
         */
        pi_state->owner = p;
        raw_spin_unlock_irq(&p->pi_lock);

        /* Drop the reference obtained by find_get_task_by_vpid(). */
        put_task_struct(p);

        *ps = pi_state;

        return 0;
}
    1375                 :            : 
    1376                 :          0 : static int lookup_pi_state(u32 __user *uaddr, u32 uval,
    1377                 :            :                            struct futex_hash_bucket *hb,
    1378                 :            :                            union futex_key *key, struct futex_pi_state **ps,
    1379                 :            :                            struct task_struct **exiting)
    1380                 :            : {
    1381                 :          0 :         struct futex_q *top_waiter = futex_top_waiter(hb, key);
    1382                 :            : 
    1383                 :            :         /*
    1384                 :            :          * If there is a waiter on that futex, validate it and
    1385                 :            :          * attach to the pi_state when the validation succeeds.
    1386                 :            :          */
    1387                 :          0 :         if (top_waiter)
    1388                 :          0 :                 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
    1389                 :            : 
    1390                 :            :         /*
    1391                 :            :          * We are the first waiter - try to look up the owner based on
    1392                 :            :          * @uval and attach to it.
    1393                 :            :          */
    1394                 :          0 :         return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
    1395                 :            : }
    1396                 :            : 
    1397                 :          0 : static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
    1398                 :            : {
    1399                 :            :         int err;
    1400                 :            :         u32 uninitialized_var(curval);
    1401                 :            : 
    1402                 :            :         if (unlikely(should_fail_futex(true)))
    1403                 :            :                 return -EFAULT;
    1404                 :            : 
    1405                 :          0 :         err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
    1406                 :          0 :         if (unlikely(err))
    1407                 :            :                 return err;
    1408                 :            : 
    1409                 :            :         /* If user space value changed, let the caller retry */
    1410                 :          0 :         return curval != uval ? -EAGAIN : 0;
    1411                 :            : }
    1412                 :            : 
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:              the pi futex user address
 * @hb:                 the pi futex hash bucket
 * @key:                the futex key associated with uaddr and hb
 * @ps:                 the pi_state pointer where we store the result of the
 *                      lookup
 * @task:               the task to perform the atomic lock work for.  This will
 *                      be "current" except in the case of requeue pi.
 * @exiting:            Pointer to store the task pointer of the owner task
 *                      which is in the middle of exiting
 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * Errors generated in this function itself are -EFAULT (the user space
 * value could not be read) and -EDEADLK (the TID in the user space value
 * is the TID of @task). All other errors are passed through from
 * lock_pi_update_atomic(), attach_to_pi_state() and attach_to_pi_owner().
 *
 * The hb->lock and futex_key refs shall be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                union futex_key *key,
                                struct futex_pi_state **ps,
                                struct task_struct *task,
                                struct task_struct **exiting,
                                int set_waiters)
{
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
        int ret;

        /*
         * Read the user space value first so we can validate a few
         * things before proceeding further.
         */
        if (get_futex_value_locked(&uval, uaddr))
                return -EFAULT;

        /*
         * NOTE(review): should_fail_futex() is presumably a fault injection
         * hook which lets the -EFAULT path above be exercised - confirm.
         */
        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        /*
         * Detect deadlocks.
         */
        if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
                return -EDEADLK;

        /* Same hook as above, here mimicking the -EDEADLK result. */
        if ((unlikely(should_fail_futex(true))))
                return -EDEADLK;

        /*
         * Lookup existing state first. If it exists, try to attach to
         * its pi_state.
         */
        top_waiter = futex_top_waiter(hb, key);
        if (top_waiter)
                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

        /*
         * No waiter and user TID is 0. We are here because the
         * waiters or the owner died bit is set or called from
         * requeue_cmp_pi or for whatever reason something took the
         * syscall.
         */
        if (!(uval & FUTEX_TID_MASK)) {
                /*
                 * We take over the futex. No other waiters and the user space
                 * TID is 0. We preserve the owner died bit.
                 */
                newval = uval & FUTEX_OWNER_DIED;
                newval |= vpid;

                /* The futex requeue_pi code can enforce the waiters bit */
                if (set_waiters)
                        newval |= FUTEX_WAITERS;

                ret = lock_pi_update_atomic(uaddr, uval, newval);
                /* If the take over worked, return 1 */
                return ret < 0 ? ret : 1;
        }

        /*
         * First waiter. Set the waiters bit before attaching ourself to
         * the owner. If owner tries to unlock, it will be forced into
         * the kernel and blocked on hb->lock.
         */
        newval = uval | FUTEX_WAITERS;
        ret = lock_pi_update_atomic(uaddr, uval, newval);
        if (ret)
                return ret;
        /*
         * If the update of the user space value succeeded, we try to
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         *
         * Note that @newval (with FUTEX_WAITERS set) is passed on, as this
         * is what user space contains after the update above.
         */
        return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
    1514                 :            : 
    1515                 :            : /**
    1516                 :            :  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
    1517                 :            :  * @q:  The futex_q to unqueue
    1518                 :            :  *
    1519                 :            :  * The q->lock_ptr must not be NULL and must be held by the caller.
    1520                 :            :  */
    1521                 :          3 : static void __unqueue_futex(struct futex_q *q)
    1522                 :            : {
    1523                 :            :         struct futex_hash_bucket *hb;
    1524                 :            : 
    1525                 :          3 :         if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
    1526                 :          3 :                 return;
    1527                 :            :         lockdep_assert_held(q->lock_ptr);
    1528                 :            : 
    1529                 :          3 :         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
    1530                 :          3 :         plist_del(&q->list, &hb->chain);
    1531                 :            :         hb_waiters_dec(hb);
    1532                 :            : }
    1533                 :            : 
/*
 * Unqueue @q and queue its task on @wake_q for a later wakeup.
 *
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 *
 * A futex_q which has a pi_state or an rt_waiter is refused with a warning
 * and left queued.
 */
static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
        /* Read the task now; @q must not be touched once lock_ptr is cleared. */
        struct task_struct *p = q->task;

        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
                return;

        /* Take a task reference before @q is released below. */
        get_task_struct(p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
         * is written, without taking any locks. This is possible in the event
         * of a spurious wakeup, for example. A memory barrier is required here
         * to prevent the following store to lock_ptr from getting ahead of the
         * plist_del in __unqueue_futex().
         */
        smp_store_release(&q->lock_ptr, NULL);

        /*
         * Queue the task for later wakeup for after we've released
         * the hb->lock. The reference taken by get_task_struct() above
         * is handed over to wake_q_add_safe().
         */
        wake_q_add_safe(wake_q, p);
}
    1564                 :            : 
    1565                 :            : /*
    1566                 :            :  * Caller must hold a reference on @pi_state.
    1567                 :            :  */
    1568                 :          0 : static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
    1569                 :            : {
    1570                 :            :         u32 uninitialized_var(curval), newval;
    1571                 :            :         struct task_struct *new_owner;
    1572                 :            :         bool postunlock = false;
    1573                 :          0 :         DEFINE_WAKE_Q(wake_q);
    1574                 :            :         int ret = 0;
    1575                 :            : 
    1576                 :          0 :         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
    1577                 :          0 :         if (WARN_ON_ONCE(!new_owner)) {
    1578                 :            :                 /*
    1579                 :            :                  * As per the comment in futex_unlock_pi() this should not happen.
    1580                 :            :                  *
    1581                 :            :                  * When this happens, give up our locks and try again, giving
    1582                 :            :                  * the futex_lock_pi() instance time to complete, either by
    1583                 :            :                  * waiting on the rtmutex or removing itself from the futex
    1584                 :            :                  * queue.
    1585                 :            :                  */
    1586                 :            :                 ret = -EAGAIN;
    1587                 :            :                 goto out_unlock;
    1588                 :            :         }
    1589                 :            : 
    1590                 :            :         /*
    1591                 :            :          * We pass it to the next owner. The WAITERS bit is always kept
    1592                 :            :          * enabled while there is PI state around. We cleanup the owner
    1593                 :            :          * died bit, because we are the owner.
    1594                 :            :          */
    1595                 :          0 :         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
    1596                 :            : 
    1597                 :            :         if (unlikely(should_fail_futex(true)))
    1598                 :            :                 ret = -EFAULT;
    1599                 :            : 
    1600                 :          0 :         ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
    1601                 :          0 :         if (!ret && (curval != uval)) {
    1602                 :            :                 /*
    1603                 :            :                  * If a unconditional UNLOCK_PI operation (user space did not
    1604                 :            :                  * try the TID->0 transition) raced with a waiter setting the
    1605                 :            :                  * FUTEX_WAITERS flag between get_user() and locking the hash
    1606                 :            :                  * bucket lock, retry the operation.
    1607                 :            :                  */
    1608                 :          0 :                 if ((FUTEX_TID_MASK & curval) == uval)
    1609                 :            :                         ret = -EAGAIN;
    1610                 :            :                 else
    1611                 :            :                         ret = -EINVAL;
    1612                 :            :         }
    1613                 :            : 
    1614                 :          0 :         if (ret)
    1615                 :            :                 goto out_unlock;
    1616                 :            : 
    1617                 :            :         /*
    1618                 :            :          * This is a point of no return; once we modify the uval there is no
    1619                 :            :          * going back and subsequent operations must not fail.
    1620                 :            :          */
    1621                 :            : 
    1622                 :          0 :         raw_spin_lock(&pi_state->owner->pi_lock);
    1623                 :          0 :         WARN_ON(list_empty(&pi_state->list));
    1624                 :            :         list_del_init(&pi_state->list);
    1625                 :          0 :         raw_spin_unlock(&pi_state->owner->pi_lock);
    1626                 :            : 
    1627                 :          0 :         raw_spin_lock(&new_owner->pi_lock);
    1628                 :          0 :         WARN_ON(!list_empty(&pi_state->list));
    1629                 :          0 :         list_add(&pi_state->list, &new_owner->pi_state_list);
    1630                 :          0 :         pi_state->owner = new_owner;
    1631                 :            :         raw_spin_unlock(&new_owner->pi_lock);
    1632                 :            : 
    1633                 :          0 :         postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
    1634                 :            : 
    1635                 :            : out_unlock:
    1636                 :          0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    1637                 :            : 
    1638                 :          0 :         if (postunlock)
    1639                 :          0 :                 rt_mutex_postunlock(&wake_q);
    1640                 :            : 
    1641                 :          0 :         return ret;
    1642                 :            : }
    1643                 :            : 
    1644                 :            : /*
    1645                 :            :  * Express the locking dependencies for lockdep:
    1646                 :            :  */
    1647                 :            : static inline void
    1648                 :          0 : double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
    1649                 :            : {
    1650                 :          0 :         if (hb1 <= hb2) {
    1651                 :            :                 spin_lock(&hb1->lock);
    1652                 :          0 :                 if (hb1 < hb2)
    1653                 :          0 :                         spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
    1654                 :            :         } else { /* hb1 > hb2 */
    1655                 :            :                 spin_lock(&hb2->lock);
    1656                 :          0 :                 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
    1657                 :            :         }
    1658                 :          0 : }
    1659                 :            : 
    1660                 :            : static inline void
    1661                 :            : double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
    1662                 :            : {
    1663                 :            :         spin_unlock(&hb1->lock);
    1664                 :          0 :         if (hb1 != hb2)
    1665                 :            :                 spin_unlock(&hb2->lock);
    1666                 :            : }
    1667                 :            : 
    1668                 :            : /*
    1669                 :            :  * Wake up waiters matching bitset queued on this futex (uaddr).
    1670                 :            :  */
    1671                 :            : static int
    1672                 :          3 : futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
    1673                 :            : {
    1674                 :            :         struct futex_hash_bucket *hb;
    1675                 :            :         struct futex_q *this, *next;
    1676                 :          3 :         union futex_key key = FUTEX_KEY_INIT;
    1677                 :            :         int ret;
    1678                 :          3 :         DEFINE_WAKE_Q(wake_q);
    1679                 :            : 
    1680                 :          3 :         if (!bitset)
    1681                 :            :                 return -EINVAL;
    1682                 :            : 
    1683                 :          3 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
    1684                 :          3 :         if (unlikely(ret != 0))
    1685                 :            :                 goto out;
    1686                 :            : 
    1687                 :          3 :         hb = hash_futex(&key);
    1688                 :            : 
    1689                 :            :         /* Make sure we really have tasks to wakeup */
    1690                 :          3 :         if (!hb_waiters_pending(hb))
    1691                 :            :                 goto out_put_key;
    1692                 :            : 
    1693                 :            :         spin_lock(&hb->lock);
    1694                 :            : 
    1695                 :          3 :         plist_for_each_entry_safe(this, next, &hb->chain, list) {
    1696                 :          3 :                 if (match_futex (&this->key, &key)) {
    1697                 :          3 :                         if (this->pi_state || this->rt_waiter) {
    1698                 :            :                                 ret = -EINVAL;
    1699                 :            :                                 break;
    1700                 :            :                         }
    1701                 :            : 
    1702                 :            :                         /* Check if one of the bits is set in both bitsets */
    1703                 :          3 :                         if (!(this->bitset & bitset))
    1704                 :          0 :                                 continue;
    1705                 :            : 
    1706                 :          3 :                         mark_wake_futex(&wake_q, this);
    1707                 :          3 :                         if (++ret >= nr_wake)
    1708                 :            :                                 break;
    1709                 :            :                 }
    1710                 :            :         }
    1711                 :            : 
    1712                 :            :         spin_unlock(&hb->lock);
    1713                 :          3 :         wake_up_q(&wake_q);
    1714                 :            : out_put_key:
    1715                 :            :         put_futex_key(&key);
    1716                 :            : out:
    1717                 :          3 :         return ret;
    1718                 :            : }
    1719                 :            : 
    1720                 :          0 : static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
    1721                 :            : {
    1722                 :          0 :         unsigned int op =         (encoded_op & 0x70000000) >> 28;
    1723                 :          0 :         unsigned int cmp =        (encoded_op & 0x0f000000) >> 24;
    1724                 :          0 :         int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
    1725                 :          0 :         int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
    1726                 :            :         int oldval, ret;
    1727                 :            : 
    1728                 :          0 :         if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
    1729                 :          0 :                 if (oparg < 0 || oparg > 31) {
    1730                 :            :                         char comm[sizeof(current->comm)];
    1731                 :            :                         /*
    1732                 :            :                          * kill this print and return -EINVAL when userspace
    1733                 :            :                          * is sane again
    1734                 :            :                          */
    1735                 :          0 :                         pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
    1736                 :            :                                         get_task_comm(comm, current), oparg);
    1737                 :          0 :                         oparg &= 31;
    1738                 :            :                 }
    1739                 :          0 :                 oparg = 1 << oparg;
    1740                 :            :         }
    1741                 :            : 
    1742                 :          0 :         if (!access_ok(uaddr, sizeof(u32)))
    1743                 :            :                 return -EFAULT;
    1744                 :            : 
    1745                 :          0 :         ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
    1746                 :          0 :         if (ret)
    1747                 :            :                 return ret;
    1748                 :            : 
    1749                 :          0 :         switch (cmp) {
    1750                 :            :         case FUTEX_OP_CMP_EQ:
    1751                 :          0 :                 return oldval == cmparg;
    1752                 :            :         case FUTEX_OP_CMP_NE:
    1753                 :          0 :                 return oldval != cmparg;
    1754                 :            :         case FUTEX_OP_CMP_LT:
    1755                 :          0 :                 return oldval < cmparg;
    1756                 :            :         case FUTEX_OP_CMP_GE:
    1757                 :          0 :                 return oldval >= cmparg;
    1758                 :            :         case FUTEX_OP_CMP_LE:
    1759                 :          0 :                 return oldval <= cmparg;
    1760                 :            :         case FUTEX_OP_CMP_GT:
    1761                 :          0 :                 return oldval > cmparg;
    1762                 :            :         default:
    1763                 :            :                 return -ENOSYS;
    1764                 :            :         }
    1765                 :            : }
    1766                 :            : 
    1767                 :            : /*
    1768                 :            :  * Wake up all waiters hashed on the physical page that is mapped
    1769                 :            :  * to this virtual address:
    1770                 :            :  */
    1771                 :            : static int
    1772                 :          0 : futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
    1773                 :            :               int nr_wake, int nr_wake2, int op)
    1774                 :            : {
    1775                 :          0 :         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
    1776                 :            :         struct futex_hash_bucket *hb1, *hb2;
    1777                 :            :         struct futex_q *this, *next;
    1778                 :            :         int ret, op_ret;
    1779                 :          0 :         DEFINE_WAKE_Q(wake_q);
    1780                 :            : 
    1781                 :            : retry:
    1782                 :          0 :         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
    1783                 :          0 :         if (unlikely(ret != 0))
    1784                 :            :                 goto out;
    1785                 :          0 :         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
    1786                 :          0 :         if (unlikely(ret != 0))
    1787                 :            :                 goto out_put_key1;
    1788                 :            : 
    1789                 :          0 :         hb1 = hash_futex(&key1);
    1790                 :          0 :         hb2 = hash_futex(&key2);
    1791                 :            : 
    1792                 :            : retry_private:
    1793                 :          0 :         double_lock_hb(hb1, hb2);
    1794                 :          0 :         op_ret = futex_atomic_op_inuser(op, uaddr2);
    1795                 :          0 :         if (unlikely(op_ret < 0)) {
    1796                 :            :                 double_unlock_hb(hb1, hb2);
    1797                 :            : 
    1798                 :          0 :                 if (!IS_ENABLED(CONFIG_MMU) ||
    1799                 :          0 :                     unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
    1800                 :            :                         /*
    1801                 :            :                          * we don't get EFAULT from MMU faults if we don't have
    1802                 :            :                          * an MMU, but we might get them from range checking
    1803                 :            :                          */
    1804                 :          0 :                         ret = op_ret;
    1805                 :          0 :                         goto out_put_keys;
    1806                 :            :                 }
    1807                 :            : 
    1808                 :          0 :                 if (op_ret == -EFAULT) {
    1809                 :          0 :                         ret = fault_in_user_writeable(uaddr2);
    1810                 :          0 :                         if (ret)
    1811                 :            :                                 goto out_put_keys;
    1812                 :            :                 }
    1813                 :            : 
    1814                 :          0 :                 if (!(flags & FLAGS_SHARED)) {
    1815                 :          0 :                         cond_resched();
    1816                 :          0 :                         goto retry_private;
    1817                 :            :                 }
    1818                 :            : 
    1819                 :            :                 put_futex_key(&key2);
    1820                 :            :                 put_futex_key(&key1);
    1821                 :          0 :                 cond_resched();
    1822                 :          0 :                 goto retry;
    1823                 :            :         }
    1824                 :            : 
    1825                 :          0 :         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
    1826                 :          0 :                 if (match_futex (&this->key, &key1)) {
    1827                 :          0 :                         if (this->pi_state || this->rt_waiter) {
    1828                 :            :                                 ret = -EINVAL;
    1829                 :            :                                 goto out_unlock;
    1830                 :            :                         }
    1831                 :          0 :                         mark_wake_futex(&wake_q, this);
    1832                 :          0 :                         if (++ret >= nr_wake)
    1833                 :            :                                 break;
    1834                 :            :                 }
    1835                 :            :         }
    1836                 :            : 
    1837                 :          0 :         if (op_ret > 0) {
    1838                 :            :                 op_ret = 0;
    1839                 :          0 :                 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
    1840                 :          0 :                         if (match_futex (&this->key, &key2)) {
    1841                 :          0 :                                 if (this->pi_state || this->rt_waiter) {
    1842                 :            :                                         ret = -EINVAL;
    1843                 :            :                                         goto out_unlock;
    1844                 :            :                                 }
    1845                 :          0 :                                 mark_wake_futex(&wake_q, this);
    1846                 :          0 :                                 if (++op_ret >= nr_wake2)
    1847                 :            :                                         break;
    1848                 :            :                         }
    1849                 :            :                 }
    1850                 :          0 :                 ret += op_ret;
    1851                 :            :         }
    1852                 :            : 
    1853                 :            : out_unlock:
    1854                 :            :         double_unlock_hb(hb1, hb2);
    1855                 :          0 :         wake_up_q(&wake_q);
    1856                 :            : out_put_keys:
    1857                 :            :         put_futex_key(&key2);
    1858                 :            : out_put_key1:
    1859                 :            :         put_futex_key(&key1);
    1860                 :            : out:
    1861                 :          0 :         return ret;
    1862                 :            : }
    1863                 :            : 
    1864                 :            : /**
    1865                 :            :  * requeue_futex() - Requeue a futex_q from one hb to another
    1866                 :            :  * @q:          the futex_q to requeue
    1867                 :            :  * @hb1:        the source hash_bucket
    1868                 :            :  * @hb2:        the target hash_bucket
    1869                 :            :  * @key2:       the new key for the requeued futex_q
    1870                 :            :  */
    1871                 :            : static inline
    1872                 :          0 : void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
    1873                 :            :                    struct futex_hash_bucket *hb2, union futex_key *key2)
    1874                 :            : {
    1875                 :            : 
    1876                 :            :         /*
    1877                 :            :          * If key1 and key2 hash to the same bucket, no need to
    1878                 :            :          * requeue.
    1879                 :            :          */
    1880                 :          0 :         if (likely(&hb1->chain != &hb2->chain)) {
    1881                 :          0 :                 plist_del(&q->list, &hb1->chain);
    1882                 :            :                 hb_waiters_dec(hb1);
    1883                 :          0 :                 hb_waiters_inc(hb2);
    1884                 :          0 :                 plist_add(&q->list, &hb2->chain);
    1885                 :          0 :                 q->lock_ptr = &hb2->lock;
    1886                 :            :         }
    1887                 :          0 :         get_futex_key_refs(key2);
    1888                 :          0 :         q->key = *key2;
    1889                 :          0 : }
    1890                 :            : 
    1891                 :            : /**
    1892                 :            :  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
    1893                 :            :  * @q:          the futex_q
    1894                 :            :  * @key:        the key of the requeue target futex
    1895                 :            :  * @hb:         the hash_bucket of the requeue target futex
    1896                 :            :  *
    1897                 :            :  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
    1898                 :            :  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
    1899                 :            :  * to the requeue target futex so the waiter can detect the wakeup on the right
    1900                 :            :  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
    1901                 :            :  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
    1902                 :            :  * to protect access to the pi_state to fixup the owner later.  Must be called
    1903                 :            :  * with both q->lock_ptr and hb->lock held.
    1904                 :            :  */
    1905                 :            : static inline
    1906                 :          0 : void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
    1907                 :            :                            struct futex_hash_bucket *hb)
    1908                 :            : {
    1909                 :          0 :         get_futex_key_refs(key);
    1910                 :          0 :         q->key = *key;
    1911                 :            : 
    1912                 :          0 :         __unqueue_futex(q);
    1913                 :            : 
    1914                 :          0 :         WARN_ON(!q->rt_waiter);
    1915                 :          0 :         q->rt_waiter = NULL;
    1916                 :            : 
    1917                 :          0 :         q->lock_ptr = &hb->lock;
    1918                 :            : 
    1919                 :          0 :         wake_up_state(q->task, TASK_NORMAL);
    1920                 :          0 : }
    1921                 :            : 
    1922                 :            : /**
    1923                 :            :  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
    1924                 :            :  * @pifutex:            the user address of the to futex
    1925                 :            :  * @hb1:                the from futex hash bucket, must be locked by the caller
    1926                 :            :  * @hb2:                the to futex hash bucket, must be locked by the caller
    1927                 :            :  * @key1:               the from futex key
    1928                 :            :  * @key2:               the to futex key
    1929                 :            :  * @ps:                 address to store the pi_state pointer
    1930                 :            :  * @exiting:            Pointer to store the task pointer of the owner task
    1931                 :            :  *                      which is in the middle of exiting
    1932                 :            :  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
    1933                 :            :  *
    1934                 :            :  * Try and get the lock on behalf of the top waiter if we can do it atomically.
    1935                 :            :  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
    1936                 :            :  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
    1937                 :            :  * hb1 and hb2 must be held by the caller.
    1938                 :            :  *
    1939                 :            :  * @exiting is only set when the return value is -EBUSY. If so, this holds
    1940                 :            :  * a refcount on the exiting task on return and the caller needs to drop it
    1941                 :            :  * after waiting for the exit to complete.
    1942                 :            :  *
    1943                 :            :  * Return:
    1944                 :            :  *  -  0 - failed to acquire the lock atomically;
    1945                 :            :  *  - >0 - acquired the lock, return value is vpid of the top_waiter
    1946                 :            :  *  - <0 - error
    1947                 :            :  */
    1948                 :            : static int
    1949                 :          0 : futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
    1950                 :            :                            struct futex_hash_bucket *hb2, union futex_key *key1,
    1951                 :            :                            union futex_key *key2, struct futex_pi_state **ps,
    1952                 :            :                            struct task_struct **exiting, int set_waiters)
    1953                 :            : {
    1954                 :            :         struct futex_q *top_waiter = NULL;
    1955                 :            :         u32 curval;
    1956                 :            :         int ret, vpid;
    1957                 :            : 
    1958                 :          0 :         if (get_futex_value_locked(&curval, pifutex))
    1959                 :            :                 return -EFAULT;
    1960                 :            : 
    1961                 :            :         if (unlikely(should_fail_futex(true)))
    1962                 :            :                 return -EFAULT;
    1963                 :            : 
    1964                 :            :         /*
    1965                 :            :          * Find the top_waiter and determine if there are additional waiters.
    1966                 :            :          * If the caller intends to requeue more than 1 waiter to pifutex,
    1967                 :            :          * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
    1968                 :            :          * as we have means to handle the possible fault.  If not, don't set
    1969                 :            :          * the bit unnecessarily as it will force the subsequent unlock to enter
    1970                 :            :          * the kernel.
    1971                 :            :          */
    1972                 :          0 :         top_waiter = futex_top_waiter(hb1, key1);
    1973                 :            : 
    1974                 :            :         /* There are no waiters, nothing for us to do. */
    1975                 :          0 :         if (!top_waiter)
    1976                 :            :                 return 0;
    1977                 :            : 
    1978                 :            :         /* Ensure we requeue to the expected futex. */
    1979                 :          0 :         if (!match_futex(top_waiter->requeue_pi_key, key2))
    1980                 :            :                 return -EINVAL;
    1981                 :            : 
    1982                 :            :         /*
    1983                 :            :          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
    1984                 :            :          * the contended case or if set_waiters is 1.  The pi_state is returned
    1985                 :            :          * in ps in contended cases.
    1986                 :            :          */
    1987                 :          0 :         vpid = task_pid_vnr(top_waiter->task);
    1988                 :          0 :         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
    1989                 :            :                                    exiting, set_waiters);
    1990                 :          0 :         if (ret == 1) {
    1991                 :          0 :                 requeue_pi_wake_futex(top_waiter, key2, hb2);
    1992                 :          0 :                 return vpid;
    1993                 :            :         }
    1994                 :            :         return ret;
    1995                 :            : }
    1996                 :            : 
    1997                 :            : /**
    1998                 :            :  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
    1999                 :            :  * @uaddr1:     source futex user address
    2000                 :            :  * @flags:      futex flags (FLAGS_SHARED, etc.)
    2001                 :            :  * @uaddr2:     target futex user address
    2002                 :            :  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
    2003                 :            :  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
    2004                 :            :  * @cmpval:     @uaddr1 expected value (or %NULL)
    2005                 :            :  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
    2006                 :            :  *              pi futex (pi to pi requeue is not supported)
    2007                 :            :  *
    2008                 :            :  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
    2009                 :            :  * uaddr2 atomically on behalf of the top waiter.
    2010                 :            :  *
    2011                 :            :  * Return:
    2012                 :            :  *  - >=0 - on success, the number of tasks requeued or woken;
    2013                 :            :  *  -  <0 - on error
    2014                 :            :  */
    2015                 :          0 : static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
    2016                 :            :                          u32 __user *uaddr2, int nr_wake, int nr_requeue,
    2017                 :            :                          u32 *cmpval, int requeue_pi)
    2018                 :            : {
    2019                 :          0 :         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
    2020                 :            :         int drop_count = 0, task_count = 0, ret;
    2021                 :          0 :         struct futex_pi_state *pi_state = NULL;
    2022                 :            :         struct futex_hash_bucket *hb1, *hb2;
    2023                 :            :         struct futex_q *this, *next;
    2024                 :          0 :         DEFINE_WAKE_Q(wake_q);
    2025                 :            : 
    2026                 :          0 :         if (nr_wake < 0 || nr_requeue < 0)
    2027                 :            :                 return -EINVAL;
    2028                 :            : 
    2029                 :            :         /*
    2030                 :            :          * When PI not supported: return -ENOSYS if requeue_pi is true,
    2031                 :            :          * consequently the compiler knows requeue_pi is always false past
    2032                 :            :          * this point which will optimize away all the conditional code
    2033                 :            :          * further down.
    2034                 :            :          */
    2035                 :            :         if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
    2036                 :            :                 return -ENOSYS;
    2037                 :            : 
    2038                 :          0 :         if (requeue_pi) {
    2039                 :            :                 /*
    2040                 :            :                  * Requeue PI only works on two distinct uaddrs. This
    2041                 :            :                  * check is only valid for private futexes. See below.
    2042                 :            :                  */
    2043                 :          0 :                 if (uaddr1 == uaddr2)
    2044                 :            :                         return -EINVAL;
    2045                 :            : 
    2046                 :            :                 /*
    2047                 :            :                  * requeue_pi requires a pi_state, try to allocate it now
    2048                 :            :                  * without any locks in case it fails.
    2049                 :            :                  */
    2050                 :          0 :                 if (refill_pi_state_cache())
    2051                 :            :                         return -ENOMEM;
    2052                 :            :                 /*
    2053                 :            :                  * requeue_pi must wake as many tasks as it can, up to nr_wake
    2054                 :            :                  * + nr_requeue, since it acquires the rt_mutex prior to
    2055                 :            :                  * returning to userspace, so as to not leave the rt_mutex with
    2056                 :            :                  * waiters and no owner.  However, second and third wake-ups
    2057                 :            :                  * cannot be predicted as they involve race conditions with the
    2058                 :            :                  * first wake and a fault while looking up the pi_state.  Both
    2059                 :            :                  * pthread_cond_signal() and pthread_cond_broadcast() should
    2060                 :            :                  * use nr_wake=1.
    2061                 :            :                  */
    2062                 :          0 :                 if (nr_wake != 1)
    2063                 :            :                         return -EINVAL;
    2064                 :            :         }
    2065                 :            : 
    2066                 :            : retry:
    2067                 :          0 :         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
    2068                 :          0 :         if (unlikely(ret != 0))
    2069                 :            :                 goto out;
    2070                 :          0 :         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
    2071                 :            :                             requeue_pi ? FUTEX_WRITE : FUTEX_READ);
    2072                 :          0 :         if (unlikely(ret != 0))
    2073                 :            :                 goto out_put_key1;
    2074                 :            : 
    2075                 :            :         /*
    2076                 :            :          * The check above which compares uaddrs is not sufficient for
    2077                 :            :          * shared futexes. We need to compare the keys:
    2078                 :            :          */
    2079                 :          0 :         if (requeue_pi && match_futex(&key1, &key2)) {
    2080                 :            :                 ret = -EINVAL;
    2081                 :            :                 goto out_put_keys;
    2082                 :            :         }
    2083                 :            : 
    2084                 :          0 :         hb1 = hash_futex(&key1);
    2085                 :          0 :         hb2 = hash_futex(&key2);
    2086                 :            : 
    2087                 :            : retry_private:
    2088                 :          0 :         hb_waiters_inc(hb2);
    2089                 :          0 :         double_lock_hb(hb1, hb2);
    2090                 :            : 
    2091                 :          0 :         if (likely(cmpval != NULL)) {
    2092                 :            :                 u32 curval;
    2093                 :            : 
    2094                 :          0 :                 ret = get_futex_value_locked(&curval, uaddr1);
    2095                 :            : 
    2096                 :          0 :                 if (unlikely(ret)) {
    2097                 :            :                         double_unlock_hb(hb1, hb2);
    2098                 :            :                         hb_waiters_dec(hb2);
    2099                 :            : 
    2100                 :          0 :                         ret = get_user(curval, uaddr1);
    2101                 :          0 :                         if (ret)
    2102                 :            :                                 goto out_put_keys;
    2103                 :            : 
    2104                 :          0 :                         if (!(flags & FLAGS_SHARED))
    2105                 :            :                                 goto retry_private;
    2106                 :            : 
    2107                 :            :                         put_futex_key(&key2);
    2108                 :            :                         put_futex_key(&key1);
    2109                 :          0 :                         goto retry;
    2110                 :            :                 }
    2111                 :          0 :                 if (curval != *cmpval) {
    2112                 :            :                         ret = -EAGAIN;
    2113                 :          0 :                         goto out_unlock;
    2114                 :            :                 }
    2115                 :            :         }
    2116                 :            : 
    2117                 :          0 :         if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
    2118                 :          0 :                 struct task_struct *exiting = NULL;
    2119                 :            : 
    2120                 :            :                 /*
    2121                 :            :                  * Attempt to acquire uaddr2 and wake the top waiter. If we
    2122                 :            :                  * intend to requeue waiters, force setting the FUTEX_WAITERS
    2123                 :            :                  * bit.  We force this here where we are able to easily handle
    2124                 :            :                  * faults rather than in the requeue loop below.
    2125                 :            :                  */
    2126                 :          0 :                 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
    2127                 :            :                                                  &key2, &pi_state,
    2128                 :            :                                                  &exiting, nr_requeue);
    2129                 :            : 
    2130                 :            :                 /*
    2131                 :            :                  * At this point the top_waiter has either taken uaddr2 or is
    2132                 :            :                  * waiting on it.  If the former, then the pi_state will not
    2133                 :            :                  * exist yet, look it up one more time to ensure we have a
    2134                 :            :                  * reference to it. If the lock was taken, ret contains the
    2135                 :            :                  * vpid of the top waiter task.
    2136                 :            :                  * If the lock was not taken, we have pi_state and an initial
    2137                 :            :                  * refcount on it. In case of an error we have nothing.
    2138                 :            :                  */
    2139                 :          0 :                 if (ret > 0) {
    2140                 :          0 :                         WARN_ON(pi_state);
    2141                 :          0 :                         drop_count++;
    2142                 :          0 :                         task_count++;
    2143                 :            :                         /*
    2144                 :            :                          * If we acquired the lock, then the user space value
    2145                 :            :                          * of uaddr2 should be vpid. It cannot be changed by
    2146                 :            :                          * the top waiter as it is blocked on hb2 lock if it
    2147                 :            :                          * tries to do so. If something fiddled with it behind
    2148                 :            :                          * our back the pi state lookup might unearth it. So
    2149                 :            :                          * we would rather use the known value than reread it
    2150                 :            :                          * and hand potential crap to lookup_pi_state.
    2151                 :            :                          *
    2152                 :            :                          * If that call succeeds then we have pi_state and an
    2153                 :            :                          * initial refcount on it.
    2154                 :            :                          */
    2155                 :          0 :                         ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
    2156                 :            :                                               &pi_state, &exiting);
    2157                 :            :                 }
    2158                 :            : 
    2159                 :          0 :                 switch (ret) {
    2160                 :            :                 case 0:
    2161                 :            :                         /* We hold a reference on the pi state. */
    2162                 :            :                         break;
    2163                 :            : 
    2164                 :            :                         /* If the above failed, then pi_state is NULL */
    2165                 :            :                 case -EFAULT:
    2166                 :            :                         double_unlock_hb(hb1, hb2);
    2167                 :            :                         hb_waiters_dec(hb2);
    2168                 :            :                         put_futex_key(&key2);
    2169                 :            :                         put_futex_key(&key1);
    2170                 :          0 :                         ret = fault_in_user_writeable(uaddr2);
    2171                 :          0 :                         if (!ret)
    2172                 :            :                                 goto retry;
    2173                 :          0 :                         goto out;
    2174                 :            :                 case -EBUSY:
    2175                 :            :                 case -EAGAIN:
    2176                 :            :                         /*
    2177                 :            :                          * Two reasons for this:
    2178                 :            :                          * - EBUSY: Owner is exiting and we just wait for the
    2179                 :            :                          *   exit to complete.
    2180                 :            :                          * - EAGAIN: The user space value changed.
    2181                 :            :                          */
    2182                 :            :                         double_unlock_hb(hb1, hb2);
    2183                 :            :                         hb_waiters_dec(hb2);
    2184                 :            :                         put_futex_key(&key2);
    2185                 :            :                         put_futex_key(&key1);
    2186                 :            :                         /*
    2187                 :            :                          * Handle the case where the owner is in the middle of
    2188                 :            :                          * exiting. Wait for the exit to complete otherwise
    2189                 :            :                          * this task might loop forever, aka. live lock.
    2190                 :            :                          */
    2191                 :          0 :                         wait_for_owner_exiting(ret, exiting);
    2192                 :          0 :                         cond_resched();
    2193                 :          0 :                         goto retry;
    2194                 :            :                 default:
    2195                 :          0 :                         goto out_unlock;
    2196                 :            :                 }
    2197                 :            :         }
    2198                 :            : 
    2199                 :          0 :         plist_for_each_entry_safe(this, next, &hb1->chain, list) {
    2200                 :          0 :                 if (task_count - nr_wake >= nr_requeue)
    2201                 :            :                         break;
    2202                 :            : 
    2203                 :          0 :                 if (!match_futex(&this->key, &key1))
    2204                 :          0 :                         continue;
    2205                 :            : 
    2206                 :            :                 /*
    2207                 :            :                  * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
    2208                 :            :                  * be paired with each other and no other futex ops.
    2209                 :            :                  *
    2210                 :            :                  * We should never be requeueing a futex_q with a pi_state,
    2211                 :            :                  * which is awaiting a futex_unlock_pi().
    2212                 :            :                  */
    2213                 :          0 :                 if ((requeue_pi && !this->rt_waiter) ||
    2214                 :          0 :                     (!requeue_pi && this->rt_waiter) ||
    2215                 :          0 :                     this->pi_state) {
    2216                 :            :                         ret = -EINVAL;
    2217                 :            :                         break;
    2218                 :            :                 }
    2219                 :            : 
    2220                 :            :                 /*
    2221                 :            :                  * Wake nr_wake waiters.  For requeue_pi, if we acquired the
    2222                 :            :                  * lock, we already woke the top_waiter.  If not, it will be
    2223                 :            :                  * woken by futex_unlock_pi().
    2224                 :            :                  */
    2225                 :          0 :                 if (++task_count <= nr_wake && !requeue_pi) {
    2226                 :          0 :                         mark_wake_futex(&wake_q, this);
    2227                 :          0 :                         continue;
    2228                 :            :                 }
    2229                 :            : 
    2230                 :            :                 /* Ensure we requeue to the expected futex for requeue_pi. */
    2231                 :          0 :                 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
    2232                 :            :                         ret = -EINVAL;
    2233                 :            :                         break;
    2234                 :            :                 }
    2235                 :            : 
    2236                 :            :                 /*
    2237                 :            :                  * Requeue nr_requeue waiters and possibly one more in the case
    2238                 :            :                  * of requeue_pi if we couldn't acquire the lock atomically.
    2239                 :            :                  */
    2240                 :          0 :                 if (requeue_pi) {
    2241                 :            :                         /*
    2242                 :            :                          * Prepare the waiter to take the rt_mutex. Take a
    2243                 :            :                          * refcount on the pi_state and store the pointer in
    2244                 :            :                          * the futex_q object of the waiter.
    2245                 :            :                          */
    2246                 :          0 :                         get_pi_state(pi_state);
    2247                 :          0 :                         this->pi_state = pi_state;
    2248                 :          0 :                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
    2249                 :            :                                                         this->rt_waiter,
    2250                 :            :                                                         this->task);
    2251                 :          0 :                         if (ret == 1) {
    2252                 :            :                                 /*
    2253                 :            :                                  * We got the lock. We neither drop the
    2254                 :            :                                  * refcount on pi_state nor clear
    2255                 :            :                                  * this->pi_state because the waiter needs the
    2256                 :            :                                  * pi_state for cleaning up the user space
    2257                 :            :                                  * value. It will drop the refcount after
    2258                 :            :                                  * doing so.
    2259                 :            :                                  */
    2260                 :          0 :                                 requeue_pi_wake_futex(this, &key2, hb2);
    2261                 :          0 :                                 drop_count++;
    2262                 :          0 :                                 continue;
    2263                 :          0 :                         } else if (ret) {
    2264                 :            :                                 /*
    2265                 :            :                                  * rt_mutex_start_proxy_lock() detected a
    2266                 :            :                                  * potential deadlock when we tried to queue
    2267                 :            :                                  * that waiter. Drop the pi_state reference
    2268                 :            :                                  * which we took above and remove the pointer
    2269                 :            :                                  * to the state from the waiters futex_q
    2270                 :            :                                  * object.
    2271                 :            :                                  */
    2272                 :          0 :                                 this->pi_state = NULL;
    2273                 :          0 :                                 put_pi_state(pi_state);
    2274                 :            :                                 /*
    2275                 :            :                                  * We stop queueing more waiters and let user
    2276                 :            :                                  * space deal with the mess.
    2277                 :            :                                  */
    2278                 :          0 :                                 break;
    2279                 :            :                         }
    2280                 :            :                 }
    2281                 :          0 :                 requeue_futex(this, hb1, hb2, &key2);
    2282                 :          0 :                 drop_count++;
    2283                 :            :         }
    2284                 :            : 
    2285                 :            :         /*
    2286                 :            :          * We took an extra initial reference to the pi_state either
    2287                 :            :          * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
    2288                 :            :          * need to drop it here again.
    2289                 :            :          */
    2290                 :          0 :         put_pi_state(pi_state);
    2291                 :            : 
    2292                 :            : out_unlock:
    2293                 :            :         double_unlock_hb(hb1, hb2);
    2294                 :          0 :         wake_up_q(&wake_q);
    2295                 :            :         hb_waiters_dec(hb2);
    2296                 :            : 
    2297                 :            :         /*
    2298                 :            :          * drop_futex_key_refs() must be called outside the spinlocks. During
    2299                 :            :          * the requeue we moved futex_q's from the hash bucket at key1 to the
    2300                 :            :          * one at key2 and updated their key pointer.  We no longer need to
    2301                 :            :          * hold the references to key1.
    2302                 :            :          */
    2303                 :          0 :         while (--drop_count >= 0)
    2304                 :          0 :                 drop_futex_key_refs(&key1);
    2305                 :            : 
    2306                 :            : out_put_keys:
    2307                 :            :         put_futex_key(&key2);
    2308                 :            : out_put_key1:
    2309                 :            :         put_futex_key(&key1);
    2310                 :            : out:
    2311                 :          0 :         return ret ? ret : task_count;
    2312                 :            : }
    2313                 :            : 
    2314                 :            : /* The key must be already stored in q->key. */
    2315                 :          3 : static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
    2316                 :            :         __acquires(&hb->lock)
    2317                 :            : {
    2318                 :            :         struct futex_hash_bucket *hb;
    2319                 :            : 
    2320                 :          3 :         hb = hash_futex(&q->key);
    2321                 :            : 
    2322                 :            :         /*
    2323                 :            :          * Increment the counter before taking the lock so that
    2324                 :            :          * a potential waker won't miss a to-be-slept task that is
    2325                 :            :          * waiting for the spinlock. This is safe as all queue_lock()
    2326                 :            :          * users end up calling queue_me(). Similarly, for housekeeping,
    2327                 :            :          * decrement the counter at queue_unlock() when some error has
    2328                 :            :          * occurred and we don't end up adding the task to the list.
    2329                 :            :          */
    2330                 :          3 :         hb_waiters_inc(hb); /* implies smp_mb(); (A) */
    2331                 :            : 
    2332                 :          3 :         q->lock_ptr = &hb->lock;
    2333                 :            : 
    2334                 :            :         spin_lock(&hb->lock);
    2335                 :          3 :         return hb;
    2336                 :            : }
    2337                 :            : 
    2338                 :            : static inline void
    2339                 :          3 : queue_unlock(struct futex_hash_bucket *hb)
    2340                 :            :         __releases(&hb->lock)
    2341                 :            : {
    2342                 :            :         spin_unlock(&hb->lock);
    2343                 :            :         hb_waiters_dec(hb);
    2344                 :          3 : }
    2345                 :            : 
    2346                 :          3 : static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
    2347                 :            : {
    2348                 :            :         int prio;
    2349                 :            : 
    2350                 :            :         /*
    2351                 :            :          * The priority used to register this element is
    2352                 :            :          * - either the real thread-priority for the real-time threads
    2353                 :            :          * (i.e. threads with a priority lower than MAX_RT_PRIO)
    2354                 :            :          * - or MAX_RT_PRIO for non-RT threads.
    2355                 :            :          * Thus, all RT-threads are woken first in priority order, and
    2356                 :            :          * the others are woken last, in FIFO order.
    2357                 :            :          */
    2358                 :          3 :         prio = min(current->normal_prio, MAX_RT_PRIO);
    2359                 :            : 
    2360                 :            :         plist_node_init(&q->list, prio);
    2361                 :          3 :         plist_add(&q->list, &hb->chain);
    2362                 :          3 :         q->task = current;
    2363                 :          3 : }
    2364                 :            : 
    2365                 :            : /**
    2366                 :            :  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
    2367                 :            :  * @q:  The futex_q to enqueue
    2368                 :            :  * @hb: The destination hash bucket
    2369                 :            :  *
    2370                 :            :  * The hb->lock must be held by the caller, and is released here. A call to
    2371                 :            :  * queue_me() is typically paired with exactly one call to unqueue_me().  The
    2372                 :            :  * exceptions involve the PI related operations, which may use unqueue_me_pi()
    2373                 :            :  * or nothing if the unqueue is done as part of the wake process and the unqueue
    2374                 :            :  * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
    2375                 :            :  * an example).
    2376                 :            :  */
    2377                 :            : static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
    2378                 :            :         __releases(&hb->lock)
    2379                 :            : {
    2380                 :          3 :         __queue_me(q, hb);
    2381                 :            :         spin_unlock(&hb->lock);
    2382                 :            : }
    2383                 :            : 
    2384                 :            : /**
    2385                 :            :  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
    2386                 :            :  * @q:  The futex_q to unqueue
    2387                 :            :  *
    2388                 :            :  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
    2389                 :            :  * be paired with exactly one earlier call to queue_me().
    2390                 :            :  *
    2391                 :            :  * Return:
    2392                 :            :  *  - 1 - if the futex_q was still queued (and we unqueued it);
    2393                 :            :  *  - 0 - if the futex_q was already removed by the waking thread
    2394                 :            :  */
    2395                 :          3 : static int unqueue_me(struct futex_q *q)
    2396                 :            : {
    2397                 :            :         spinlock_t *lock_ptr;
    2398                 :            :         int ret = 0;
    2399                 :            : 
    2400                 :            :         /* In the common case we don't take the spinlock, which is nice. */
    2401                 :            : retry:
    2402                 :            :         /*
    2403                 :            :          * q->lock_ptr can change between this read and the following spin_lock.
    2404                 :            :          * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
    2405                 :            :          * optimizing lock_ptr out of the logic below.
    2406                 :            :          */
    2407                 :          3 :         lock_ptr = READ_ONCE(q->lock_ptr);
    2408                 :          3 :         if (lock_ptr != NULL) {
    2409                 :            :                 spin_lock(lock_ptr);
    2410                 :            :                 /*
    2411                 :            :                  * q->lock_ptr can change between reading it and
    2412                 :            :                  * spin_lock(), causing us to take the wrong lock.  This
    2413                 :            :                  * corrects the race condition.
    2414                 :            :                  *
    2415                 :            :                  * Reasoning goes like this: if we have the wrong lock,
    2416                 :            :                  * q->lock_ptr must have changed (maybe several times)
    2417                 :            :                  * between reading it and the spin_lock().  It can
    2418                 :            :                  * change again after the spin_lock() but only if it was
    2419                 :            :                  * already changed before the spin_lock().  It cannot,
    2420                 :            :                  * however, change back to the original value.  Therefore
    2421                 :            :                  * we can detect whether we acquired the correct lock.
    2422                 :            :                  */
    2423                 :          3 :                 if (unlikely(lock_ptr != q->lock_ptr)) {
    2424                 :            :                         spin_unlock(lock_ptr);
    2425                 :            :                         goto retry;
    2426                 :            :                 }
    2427                 :          3 :                 __unqueue_futex(q);
    2428                 :            : 
    2429                 :          3 :                 BUG_ON(q->pi_state);
    2430                 :            : 
    2431                 :            :                 spin_unlock(lock_ptr);
    2432                 :            :                 ret = 1;
    2433                 :            :         }
    2434                 :            : 
    2435                 :          3 :         drop_futex_key_refs(&q->key);
    2436                 :          3 :         return ret;
    2437                 :            : }
    2438                 :            : 
    2439                 :            : /*
    2440                 :            :  * PI futexes cannot be requeued and must remove themselves from the
    2441                 :            :  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
    2442                 :            :  * and dropped here.
    2443                 :            :  */
    2444                 :          0 : static void unqueue_me_pi(struct futex_q *q)
    2445                 :            :         __releases(q->lock_ptr)
    2446                 :            : {
    2447                 :          0 :         __unqueue_futex(q);
    2448                 :            : 
    2449                 :          0 :         BUG_ON(!q->pi_state);
    2450                 :          0 :         put_pi_state(q->pi_state);
    2451                 :          0 :         q->pi_state = NULL;
    2452                 :            : 
    2453                 :          0 :         spin_unlock(q->lock_ptr);
    2454                 :          0 : }
    2455                 :            : 
    2456                 :          0 : static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
    2457                 :            :                                 struct task_struct *argowner)
    2458                 :            : {
    2459                 :          0 :         struct futex_pi_state *pi_state = q->pi_state;
    2460                 :            :         u32 uval, uninitialized_var(curval), newval;
    2461                 :            :         struct task_struct *oldowner, *newowner;
    2462                 :            :         u32 newtid;
    2463                 :            :         int ret, err = 0;
    2464                 :            : 
    2465                 :            :         lockdep_assert_held(q->lock_ptr);
    2466                 :            : 
    2467                 :          0 :         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    2468                 :            : 
    2469                 :          0 :         oldowner = pi_state->owner;
    2470                 :            : 
    2471                 :            :         /*
    2472                 :            :          * We are here because either:
    2473                 :            :          *
    2474                 :            :          *  - we stole the lock and pi_state->owner needs updating to reflect
    2475                 :            :          *    that (@argowner == current),
    2476                 :            :          *
    2477                 :            :          * or:
    2478                 :            :          *
    2479                 :            :          *  - someone stole our lock and we need to fix things to point to the
    2480                 :            :          *    new owner (@argowner == NULL).
    2481                 :            :          *
    2482                 :            :          * Either way, we have to replace the TID in the user space variable.
    2483                 :            :          * This must be atomic as we have to preserve the owner died bit here.
    2484                 :            :          *
    2485                 :            :          * Note: We write the user space value _before_ changing the pi_state
    2486                 :            :          * because we can fault here. Imagine swapped out pages or a fork
    2487                 :            :          * that marked all the anonymous memory readonly for cow.
    2488                 :            :          *
    2489                 :            :          * Modifying pi_state _before_ the user space value would leave the
    2490                 :            :          * pi_state in an inconsistent state when we fault here, because we
    2491                 :            :          * need to drop the locks to handle the fault. This might be observed
    2492                 :            :          * in the PID check in lookup_pi_state.
    2493                 :            :          */
    2494                 :            : retry:
    2495                 :          0 :         if (!argowner) {
    2496                 :          0 :                 if (oldowner != current) {
    2497                 :            :                         /*
    2498                 :            :                          * We raced against a concurrent self; things are
    2499                 :            :                          * already fixed up. Nothing to do.
    2500                 :            :                          */
    2501                 :            :                         ret = 0;
    2502                 :            :                         goto out_unlock;
    2503                 :            :                 }
    2504                 :            : 
    2505                 :          0 :                 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
    2506                 :            :                         /* We got the lock after all, nothing to fix. */
    2507                 :            :                         ret = 0;
    2508                 :            :                         goto out_unlock;
    2509                 :            :                 }
    2510                 :            : 
    2511                 :            :                 /*
    2512                 :            :                  * Since we just failed the trylock, there must be an owner.
    2513                 :            :                  */
    2514                 :            :                 newowner = rt_mutex_owner(&pi_state->pi_mutex);
    2515                 :          0 :                 BUG_ON(!newowner);
    2516                 :            :         } else {
    2517                 :          0 :                 WARN_ON_ONCE(argowner != current);
    2518                 :          0 :                 if (oldowner == current) {
    2519                 :            :                         /*
    2520                 :            :                          * We raced against a concurrent self; things are
    2521                 :            :                          * already fixed up. Nothing to do.
    2522                 :            :                          */
    2523                 :            :                         ret = 0;
    2524                 :            :                         goto out_unlock;
    2525                 :            :                 }
    2526                 :            :                 newowner = argowner;
    2527                 :            :         }
    2528                 :            : 
    2529                 :          0 :         newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
    2530                 :            :         /* Owner died? */
    2531                 :          0 :         if (!pi_state->owner)
    2532                 :          0 :                 newtid |= FUTEX_OWNER_DIED;
    2533                 :            : 
    2534                 :          0 :         err = get_futex_value_locked(&uval, uaddr);
    2535                 :          0 :         if (err)
    2536                 :            :                 goto handle_err;
    2537                 :            : 
    2538                 :            :         for (;;) {
    2539                 :          0 :                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
    2540                 :            : 
    2541                 :          0 :                 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
    2542                 :          0 :                 if (err)
    2543                 :            :                         goto handle_err;
    2544                 :            : 
    2545                 :          0 :                 if (curval == uval)
    2546                 :            :                         break;
    2547                 :          0 :                 uval = curval;
    2548                 :          0 :         }
    2549                 :            : 
    2550                 :            :         /*
    2551                 :            :          * We fixed up user space. Now we need to fix the pi_state
    2552                 :            :          * itself.
    2553                 :            :          */
    2554                 :          0 :         if (pi_state->owner != NULL) {
    2555                 :          0 :                 raw_spin_lock(&pi_state->owner->pi_lock);
    2556                 :          0 :                 WARN_ON(list_empty(&pi_state->list));
    2557                 :            :                 list_del_init(&pi_state->list);
    2558                 :          0 :                 raw_spin_unlock(&pi_state->owner->pi_lock);
    2559                 :            :         }
    2560                 :            : 
    2561                 :          0 :         pi_state->owner = newowner;
    2562                 :            : 
    2563                 :          0 :         raw_spin_lock(&newowner->pi_lock);
    2564                 :          0 :         WARN_ON(!list_empty(&pi_state->list));
    2565                 :          0 :         list_add(&pi_state->list, &newowner->pi_state_list);
    2566                 :            :         raw_spin_unlock(&newowner->pi_lock);
    2567                 :          0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    2568                 :            : 
    2569                 :          0 :         return 0;
    2570                 :            : 
    2571                 :            :         /*
    2572                 :            :          * In order to reschedule or handle a page fault, we need to drop the
    2573                 :            :          * locks here. In the case of a fault, this gives the other task
    2574                 :            :          * (either the highest priority waiter itself or the task which stole
    2575                 :            :          * the rtmutex) the chance to try the fixup of the pi_state. So once we
    2576                 :            :          * are back from handling the fault we need to check the pi_state after
    2577                 :            :          * reacquiring the locks and before trying to do another fixup. When
    2578                 :            :          * the fixup has been done already we simply return.
    2579                 :            :          *
    2580                 :            :          * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
    2581                 :            :          * drop hb->lock since the caller owns the hb -> futex_q relation.
    2582                 :            :          * Dropping pi_mutex->wait_lock requires the state to be revalidated.
    2583                 :            :          */
    2584                 :            : handle_err:
    2585                 :          0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    2586                 :          0 :         spin_unlock(q->lock_ptr);
    2587                 :            : 
    2588                 :          0 :         switch (err) {
    2589                 :            :         case -EFAULT:
    2590                 :          0 :                 ret = fault_in_user_writeable(uaddr);
    2591                 :          0 :                 break;
    2592                 :            : 
    2593                 :            :         case -EAGAIN:
    2594                 :          0 :                 cond_resched();
    2595                 :            :                 ret = 0;
    2596                 :          0 :                 break;
    2597                 :            : 
    2598                 :            :         default:
    2599                 :          0 :                 WARN_ON_ONCE(1);
    2600                 :            :                 ret = err;
    2601                 :          0 :                 break;
    2602                 :            :         }
    2603                 :            : 
    2604                 :          0 :         spin_lock(q->lock_ptr);
    2605                 :          0 :         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    2606                 :            : 
    2607                 :            :         /*
    2608                 :            :          * Check if someone else fixed it for us:
    2609                 :            :          */
    2610                 :          0 :         if (pi_state->owner != oldowner) {
    2611                 :            :                 ret = 0;
    2612                 :            :                 goto out_unlock;
    2613                 :            :         }
    2614                 :            : 
    2615                 :          0 :         if (ret)
    2616                 :            :                 goto out_unlock;
    2617                 :            : 
    2618                 :            :         goto retry;
    2619                 :            : 
    2620                 :            : out_unlock:
    2621                 :          0 :         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
    2622                 :          0 :         return ret;
    2623                 :            : }
    2624                 :            : 
    2625                 :            : static long futex_wait_restart(struct restart_block *restart);
    2626                 :            : 
    2627                 :            : /**
    2628                 :            :  * fixup_owner() - Post-lock pi_state fixup and corner-case handling
    2629                 :            :  * @uaddr:      user address of the futex
    2630                 :            :  * @q:          futex_q (contains pi_state and access to the rt_mutex)
    2631                 :            :  * @locked:     1 if the attempt to take the rt_mutex succeeded, 0 if not
    2632                 :            :  *
    2633                 :            :  * Called after an attempt to lock an rt_mutex to clean up the pi_state
    2634                 :            :  * owner and to handle race conditions that may allow us to acquire the
    2635                 :            :  * lock. Must be called with the hb lock held.
    2636                 :            :  *
    2637                 :            :  * Return:
    2638                 :            :  *  -  1 - success, lock taken;
    2639                 :            :  *  -  0 - success, lock not taken;
    2640                 :            :  *  - <0 - error from fixup_pi_state_owner() (e.g. -EFAULT)
    2641                 :            :  */
    2642                 :          0 : static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
    2643                 :            : {
    2644                 :            :         int ret = 0;
    2645                 :            : 
    2646                 :          0 :         if (locked) {
    2647                 :            :                 /*
    2648                 :            :                  * Got the lock. We might not be the anticipated owner if we
    2649                 :            :                  * did a lock-steal - fix up the PI-state in that case:
    2650                 :            :                  *
    2651                 :            :                  * Speculative pi_state->owner read (we don't hold wait_lock);
    2652                 :            :                  * since we own the lock pi_state->owner == current is the
    2653                 :            :                  * stable state, anything else needs more attention.
    2654                 :            :                  */
    2655                 :          0 :                 if (q->pi_state->owner != current)
    2656                 :          0 :                         ret = fixup_pi_state_owner(uaddr, q, current);
    2657                 :            :                 goto out;
    2658                 :            :         }
    2659                 :            : 
    2660                 :            :         /*
    2661                 :            :          * If we didn't get the lock; check if anybody stole it from us. In
    2662                 :            :          * that case, we need to fix up the uval to point to them instead of
    2663                 :            :          * us, otherwise bad things happen. [10]
    2664                 :            :          *
    2665                 :            :          * Another speculative read; pi_state->owner == current is unstable
    2666                 :            :          * but needs our attention.
    2667                 :            :          */
    2668                 :          0 :         if (q->pi_state->owner == current) {
    2669                 :          0 :                 ret = fixup_pi_state_owner(uaddr, q, NULL);
    2670                 :          0 :                 goto out;
    2671                 :            :         }
    2672                 :            : 
    2673                 :            :         /*
    2674                 :            :          * Paranoia check. If we did not take the lock, then we should not be
    2675                 :            :          * the owner of the rt_mutex. If we are, only log it; ret is unchanged.
    2676                 :            :          */
    2677                 :          0 :         if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
    2678                 :          0 :                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
    2679                 :            :                                 "pi-state %p\n", ret,
    2680                 :            :                                 q->pi_state->pi_mutex.owner,
    2681                 :            :                                 q->pi_state->owner);
    2682                 :            :         }
    2683                 :            : 
    2684                 :            : out:
    2685                 :          0 :         return ret ? ret : locked; /* an error takes precedence over @locked */
    2686                 :            : }
    2687                 :            : 
    2688                 :            : /**
    2689                 :            :  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
    2690                 :            :  * @hb:         the futex hash bucket, must be locked by the caller
    2691                 :            :  * @q:          the futex_q to queue up on
    2692                 :            :  * @timeout:    the prepared hrtimer_sleeper, or NULL for no timeout
    2693                 :            :  */
    2694                 :          3 : static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
    2695                 :            :                                 struct hrtimer_sleeper *timeout)
    2696                 :            : {
    2697                 :            :         /*
    2698                 :            :          * The task state is guaranteed to be set before another task can
    2699                 :            :          * wake it. set_current_state() is implemented using smp_store_mb() and
    2700                 :            :          * queue_me() calls spin_unlock() upon completion, both serializing
    2701                 :            :          * access to the hash list and forcing another memory barrier.
    2702                 :            :          */
    2703                 :          3 :         set_current_state(TASK_INTERRUPTIBLE);
    2704                 :            :         queue_me(q, hb);
    2705                 :            : 
    2706                 :            :         /* Arm the timer, if the caller prepared one */
    2707                 :          3 :         if (timeout)
    2708                 :          3 :                 hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
    2709                 :            : 
    2710                 :            :         /*
    2711                 :            :          * If we have been removed from the hash list, then another task
    2712                 :            :          * has tried to wake us, and we can skip the call to schedule().
    2713                 :            :          */
    2714                 :          3 :         if (likely(!plist_node_empty(&q->list))) {
    2715                 :            :                 /*
    2716                 :            :                  * If the timer has already expired, current will already be
    2717                 :            :                  * flagged for rescheduling. Only call schedule if there is no
    2718                 :            :                  * timeout, or if it has yet to expire (timeout->task still set).
    2719                 :            :                  */
    2720                 :          3 :                 if (!timeout || timeout->task)
    2721                 :          3 :                         freezable_schedule();
    2722                 :            :         }
    2723                 :          3 :         __set_current_state(TASK_RUNNING);
    2724                 :          3 : }
    2725                 :            : 
    2726                 :            : /**
    2727                 :            :  * futex_wait_setup() - Prepare to wait on a futex
    2728                 :            :  * @uaddr:      the futex userspace address
    2729                 :            :  * @val:        the expected value
    2730                 :            :  * @flags:      futex flags (FLAGS_SHARED, etc.)
    2731                 :            :  * @q:          the associated futex_q
    2732                 :            :  * @hb:         storage for hash_bucket pointer to be returned to caller
    2733                 :            :  *
    2734                 :            :  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
    2735                 :            :  * compare it with the expected value.  Handle atomic faults internally.
    2736                 :            :  * Return with the hb lock held and a q.key reference on success, and unlocked
    2737                 :            :  * with no q.key reference on failure.
    2738                 :            :  *
    2739                 :            :  * Return:
    2740                 :            :  *  -  0 - uaddr contains val and hb has been locked;
    2741                 :            :  *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
    2742                 :            :  */
    2743                 :          3 : static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
    2744                 :            :                            struct futex_q *q, struct futex_hash_bucket **hb)
    2745                 :            : {
    2746                 :            :         u32 uval;
    2747                 :            :         int ret;
    2748                 :            : 
    2749                 :            :         /*
    2750                 :            :          * Access the page AFTER the hash-bucket is locked.
    2751                 :            :          * Order is important:
    2752                 :            :          *
    2753                 :            :          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
    2754                 :            :          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
    2755                 :            :          *
    2756                 :            :          * The basic logical guarantee of a futex is that it blocks ONLY
    2757                 :            :          * if cond(var) is known to be true at the time of blocking, for
    2758                 :            :          * any cond.  If we locked the hash-bucket after testing *uaddr, that
    2759                 :            :          * would open a race condition where we could block indefinitely with
    2760                 :            :          * cond(var) false, which would violate the guarantee.
    2761                 :            :          *
    2762                 :            :          * On the other hand, we insert q and release the hash-bucket only
    2763                 :            :          * after testing *uaddr.  This guarantees that futex_wait() will NOT
    2764                 :            :          * absorb a wakeup if *uaddr does not match the desired values
    2765                 :            :          * while the syscall executes.
    2766                 :            :          */
    2767                 :            : retry:
    2768                 :          3 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
    2769                 :          3 :         if (unlikely(ret != 0))
    2770                 :          0 :                 return ret;
    2771                 :            : 
    2772                 :            : retry_private:
    2773                 :          3 :         *hb = queue_lock(q);
    2774                 :            : 
    2775                 :          3 :         ret = get_futex_value_locked(&uval, uaddr);
    2776                 :            : 
    2777                 :          3 :         if (ret) {
    2778                 :          0 :                 queue_unlock(*hb);
    2779                 :            : 
    2780                 :          0 :                 ret = get_user(uval, uaddr); /* re-read with the hb lock dropped */
    2781                 :          0 :                 if (ret)
    2782                 :            :                         goto out;
    2783                 :            : 
    2784                 :          0 :                 if (!(flags & FLAGS_SHARED))
    2785                 :            :                         goto retry_private;
    2786                 :            : 
    2787                 :            :                 put_futex_key(&q->key); /* shared: redo the key lookup too */
    2788                 :            :                 goto retry;
    2789                 :            :         }
    2790                 :            : 
    2791                 :          3 :         if (uval != val) {
    2792                 :          3 :                 queue_unlock(*hb);
    2793                 :            :                 ret = -EWOULDBLOCK;
    2794                 :            :         }
    2795                 :            : 
    2796                 :            : out:
    2797                 :          3 :         if (ret)
    2798                 :          3 :                 put_futex_key(&q->key);
    2799                 :          3 :         return ret;
    2800                 :            : }
    2801                 :            : 
    2802                 :          3 : static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
    2803                 :            :                       ktime_t *abs_time, u32 bitset)
    2804                 :            : {
    2805                 :            :         struct hrtimer_sleeper timeout, *to;
    2806                 :            :         struct restart_block *restart;
    2807                 :            :         struct futex_hash_bucket *hb;
    2808                 :          3 :         struct futex_q q = futex_q_init;
    2809                 :            :         int ret;
    2810                 :            : 
    2811                 :          3 :         if (!bitset)
    2812                 :            :                 return -EINVAL;
    2813                 :          3 :         q.bitset = bitset;
    2814                 :            : 
    2815                 :          3 :         to = futex_setup_timer(abs_time, &timeout, flags,
    2816                 :          3 :                                current->timer_slack_ns);
    2817                 :            : retry:
    2818                 :            :         /*
    2819                 :            :          * Prepare to wait on uaddr. On success the hb lock is held and a q.key
    2820                 :            :          * reference is taken; on failure neither is (see futex_wait_setup()).
    2821                 :            :          */
    2822                 :          3 :         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
    2823                 :          3 :         if (ret)
    2824                 :            :                 goto out;
    2825                 :            : 
    2826                 :            :         /* queue_me and wait for wakeup, timeout, or a signal. */
    2827                 :          3 :         futex_wait_queue_me(hb, &q, to);
    2828                 :            : 
    2829                 :            :         /* If we were woken (and thus already unqueued), the wait succeeded. */
    2830                 :            :         ret = 0;
    2831                 :            :         /* unqueue_me() drops q.key ref */
    2832                 :          3 :         if (!unqueue_me(&q))
    2833                 :            :                 goto out;
    2834                 :            :         ret = -ETIMEDOUT;
    2835                 :          3 :         if (to && !to->task) /* timeout expired */
    2836                 :            :                 goto out;
    2837                 :            : 
    2838                 :            :         /*
    2839                 :            :          * We expect signal_pending(current), but we might be the
    2840                 :            :          * victim of a spurious wakeup as well.
    2841                 :            :          */
    2842                 :          3 :         if (!signal_pending(current))
    2843                 :            :                 goto retry;
    2844                 :            : 
    2845                 :            :         ret = -ERESTARTSYS;
    2846                 :          3 :         if (!abs_time) /* no timeout to carry over a restart */
    2847                 :            :                 goto out;
    2848                 :            : 
    2849                 :          3 :         restart = &current->restart_block;
    2850                 :          3 :         restart->fn = futex_wait_restart;
    2851                 :          3 :         restart->futex.uaddr = uaddr;
    2852                 :          3 :         restart->futex.val = val;
    2853                 :          3 :         restart->futex.time = *abs_time;
    2854                 :          3 :         restart->futex.bitset = bitset;
    2855                 :          3 :         restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
    2856                 :            : 
    2857                 :            :         ret = -ERESTART_RESTARTBLOCK;
    2858                 :            : 
    2859                 :            : out:
    2860                 :          3 :         if (to) {
    2861                 :          3 :                 hrtimer_cancel(&to->timer);
    2862                 :            :                 destroy_hrtimer_on_stack(&to->timer);
    2863                 :            :         }
    2864                 :          3 :         return ret;
    2865                 :            : }
    2866                 :            : 
    2867                 :            : 
    2868                 :          0 : static long futex_wait_restart(struct restart_block *restart)
    2869                 :            : {
    2870                 :          0 :         u32 __user *uaddr = restart->futex.uaddr;
    2871                 :            :         ktime_t t, *tp = NULL;
    2872                 :            : 
    2873                 :          0 :         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { /* set by futex_wait() */
    2874                 :          0 :                 t = restart->futex.time;
    2875                 :            :                 tp = &t;
    2876                 :            :         }
    2877                 :          0 :         restart->fn = do_no_restart_syscall; /* futex_wait() re-arms it if needed */
    2878                 :            : 
    2879                 :          0 :         return (long)futex_wait(uaddr, restart->futex.flags,
    2880                 :            :                                 restart->futex.val, tp, restart->futex.bitset);
    2881                 :            : }
    2882                 :            : 
    2883                 :            : 
    2884                 :            : /*
    2885                 :            :  * Userspace tried a 0 -> TID atomic transition of the futex value
    2886                 :            :  * and failed. The kernel side here does the whole locking operation:
    2887                 :            :  * if there are waiters then it will block as a consequence of relying
    2888                 :            :  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
    2889                 :            :  * a 0 value of the futex too.)
    2890                 :            :  *
    2891                 :            :  * Also implements futex trylock_pi() (when @trylock is set), with the due semantics.
    2892                 :            :  */
    2893                 :          0 : static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
    2894                 :            :                          ktime_t *time, int trylock)
    2895                 :            : {
    2896                 :            :         struct hrtimer_sleeper timeout, *to;
    2897                 :            :         struct futex_pi_state *pi_state = NULL;
    2898                 :          0 :         struct task_struct *exiting = NULL;
    2899                 :            :         struct rt_mutex_waiter rt_waiter;
    2900                 :            :         struct futex_hash_bucket *hb;
    2901                 :          0 :         struct futex_q q = futex_q_init;
    2902                 :            :         int res, ret;
    2903                 :            : 
    2904                 :            :         if (!IS_ENABLED(CONFIG_FUTEX_PI))
    2905                 :            :                 return -ENOSYS;
    2906                 :            : 
    2907                 :          0 :         if (refill_pi_state_cache())
    2908                 :            :                 return -ENOMEM;
    2909                 :            : 
    2910                 :          0 :         to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
    2911                 :            : 
    2912                 :            : retry:
    2913                 :          0 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
    2914                 :          0 :         if (unlikely(ret != 0))
    2915                 :            :                 goto out;
    2916                 :            : 
    2917                 :            : retry_private:
    2918                 :          0 :         hb = queue_lock(&q);
    2919                 :            : 
    2920                 :          0 :         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
    2921                 :            :                                    &exiting, 0);
    2922                 :          0 :         if (unlikely(ret)) {
    2923                 :            :                 /*
    2924                 :            :                  * Atomic work succeeded and we got the lock,
    2925                 :            :                  * or failed. Either way, we do _not_ block.
    2926                 :            :                  */
    2927                 :          0 :                 switch (ret) {
    2928                 :            :                 case 1:
    2929                 :            :                         /* We got the lock. */
    2930                 :            :                         ret = 0;
    2931                 :          0 :                         goto out_unlock_put_key;
    2932                 :            :                 case -EFAULT:
    2933                 :            :                         goto uaddr_faulted;
    2934                 :            :                 case -EBUSY:
    2935                 :            :                 case -EAGAIN:
    2936                 :            :                         /*
    2937                 :            :                          * Two reasons for this:
    2938                 :            :                          * - EBUSY: Task is exiting and we just wait for the
    2939                 :            :                          *   exit to complete.
    2940                 :            :                          * - EAGAIN: The user space value changed.
    2941                 :            :                          */
    2942                 :          0 :                         queue_unlock(hb);
    2943                 :            :                         put_futex_key(&q.key);
    2944                 :            :                         /*
    2945                 :            :                          * Handle the case where the owner is in the middle of
    2946                 :            :                          * exiting. Wait for the exit to complete otherwise
    2947                 :            :                          * this task might loop forever, aka. live lock.
    2948                 :            :                          */
    2949                 :          0 :                         wait_for_owner_exiting(ret, exiting);
    2950                 :          0 :                         cond_resched();
    2951                 :          0 :                         goto retry;
    2952                 :            :                 default:
    2953                 :            :                         goto out_unlock_put_key;
    2954                 :            :                 }
    2955                 :            :         }
    2956                 :            : 
    2957                 :          0 :         WARN_ON(!q.pi_state);
    2958                 :            : 
    2959                 :            :         /*
    2960                 :            :          * Only actually queue now that the atomic ops are done:
    2961                 :            :          */
    2962                 :          0 :         __queue_me(&q, hb);
    2963                 :            : 
    2964                 :          0 :         if (trylock) {
    2965                 :          0 :                 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
    2966                 :            :                 /* Fixup the trylock return value: */
    2967                 :          0 :                 ret = ret ? 0 : -EWOULDBLOCK;
    2968                 :          0 :                 goto no_block;
    2969                 :            :         }
    2970                 :            : 
    2971                 :          0 :         rt_mutex_init_waiter(&rt_waiter);
    2972                 :            : 
    2973                 :            :         /*
    2974                 :            :          * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
    2975                 :            :          * hold it while doing rt_mutex_start_proxy(), because then it will
    2976                 :            :          * include hb->lock in the blocking chain, even though we'll not in
    2977                 :            :          * fact hold it while blocking. This will lead it to report -EDEADLK
    2978                 :            :          * and BUG when futex_unlock_pi() interleaves with this.
    2979                 :            :          *
    2980                 :            :          * Therefore acquire wait_lock while holding hb->lock, but drop the
    2981                 :            :          * latter before calling __rt_mutex_start_proxy_lock(). This
    2982                 :            :          * interleaves with futex_unlock_pi() -- which does a similar lock
    2983                 :            :          * handoff -- such that the latter can observe the futex_q::pi_state
    2984                 :            :          * before __rt_mutex_start_proxy_lock() is done.
    2985                 :            :          */
    2986                 :          0 :         raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
    2987                 :          0 :         spin_unlock(q.lock_ptr);
    2988                 :            :         /*
    2989                 :            :          * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
    2990                 :            :          * such that futex_unlock_pi() is guaranteed to observe the waiter when
    2991                 :            :          * it sees the futex_q::pi_state.
    2992                 :            :          */
    2993                 :          0 :         ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
    2994                 :          0 :         raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
    2995                 :            : 
    2996                 :          0 :         if (ret) {
    2997                 :          0 :                 if (ret == 1)
    2998                 :            :                         ret = 0;
    2999                 :            :                 goto cleanup;
    3000                 :            :         }
    3001                 :            : 
    3002                 :          0 :         if (unlikely(to))
    3003                 :          0 :                 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
    3004                 :            : 
    3005                 :          0 :         ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
    3006                 :            : 
    3007                 :            : cleanup:
    3008                 :          0 :         spin_lock(q.lock_ptr);
    3009                 :            :         /*
    3010                 :            :          * If we failed to acquire the lock (deadlock/signal/timeout), we must
    3011                 :            :          * first acquire the hb->lock before removing the lock from the
    3012                 :            :          * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
    3013                 :            :          * lists consistent.
    3014                 :            :          *
    3015                 :            :          * In particular; it is important that futex_unlock_pi() can not
    3016                 :            :          * observe this inconsistency.
    3017                 :            :          */
    3018                 :          0 :         if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
    3019                 :            :                 ret = 0;
    3020                 :            : 
    3021                 :            : no_block:
    3022                 :            :         /*
    3023                 :            :          * Fixup the pi_state owner and possibly acquire the lock if we
    3024                 :            :          * haven't already.
    3025                 :            :          */
    3026                 :          0 :         res = fixup_owner(uaddr, &q, !ret);
    3027                 :            :         /*
    3028                 :            :          * If fixup_owner() returned an error, propagate that.  If it acquired
    3029                 :            :          * the lock, clear our -ETIMEDOUT or -EINTR.
    3030                 :            :          */
    3031                 :          0 :         if (res)
    3032                 :          0 :                 ret = (res < 0) ? res : 0;
    3033                 :            : 
    3034                 :            :         /*
    3035                 :            :          * If fixup_owner() faulted and was unable to handle the fault, unlock
    3036                 :            :          * it and return the fault to userspace.
    3037                 :            :          */
    3038                 :          0 :         if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
    3039                 :            :                 pi_state = q.pi_state;
    3040                 :          0 :                 get_pi_state(pi_state);
    3041                 :            :         }
    3042                 :            : 
    3043                 :            :         /* Unqueue and drop the lock */
    3044                 :          0 :         unqueue_me_pi(&q);
    3045                 :            : 
    3046                 :          0 :         if (pi_state) {
    3047                 :          0 :                 rt_mutex_futex_unlock(&pi_state->pi_mutex);
    3048                 :          0 :                 put_pi_state(pi_state);
    3049                 :            :         }
    3050                 :            : 
    3051                 :            :         goto out_put_key;
    3052                 :            : 
    3053                 :            : out_unlock_put_key:
    3054                 :          0 :         queue_unlock(hb);
    3055                 :            : 
    3056                 :            : out_put_key:
    3057                 :            :         put_futex_key(&q.key);
    3058                 :            : out:
    3059                 :          0 :         if (to) {
    3060                 :          0 :                 hrtimer_cancel(&to->timer);
    3061                 :            :                 destroy_hrtimer_on_stack(&to->timer);
    3062                 :            :         }
    3063                 :          0 :         return ret != -EINTR ? ret : -ERESTARTNOINTR;
    3064                 :            : 
    3065                 :            : uaddr_faulted:
    3066                 :          0 :         queue_unlock(hb);
    3067                 :            : 
    3068                 :          0 :         ret = fault_in_user_writeable(uaddr);
    3069                 :          0 :         if (ret)
    3070                 :            :                 goto out_put_key;
    3071                 :            : 
    3072                 :          0 :         if (!(flags & FLAGS_SHARED))
    3073                 :            :                 goto retry_private;
    3074                 :            : 
    3075                 :            :         put_futex_key(&q.key);
    3076                 :            :         goto retry;
    3077                 :            : }
    3078                 :            : 
    3079                 :            : /*
    3080                 :            :  * Userspace attempted a TID -> 0 atomic transition, and failed.
    3081                 :            :  * This is the in-kernel slowpath: we look up the PI state (if any),
    3082                 :            :  * and do the rt-mutex unlock.
                         :            :  *
                         :            :  * Return: 0 on success; -ENOSYS without CONFIG_FUTEX_PI; -EFAULT if the
                         :            :  * futex word cannot be read; -EPERM if the TID in the futex word is not
                         :            :  * the caller's; -EINVAL if the top waiter has no pi_state or the caller
                         :            :  * does not own it; -EAGAIN if the value seen by the final cmpxchg differs
                         :            :  * from the one read earlier; otherwise the error reported by
                         :            :  * get_futex_key(), wake_futex_pi(), cmpxchg_futex_value_locked() or
                         :            :  * fault_in_user_writeable().
    3083                 :            :  */
    3084                 :          1 : static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
    3085                 :            : {
    3086                 :          1 :         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
    3087                 :          1 :         union futex_key key = FUTEX_KEY_INIT;
    3088                 :            :         struct futex_hash_bucket *hb;
    3089                 :            :         struct futex_q *top_waiter;
    3090                 :            :         int ret;
    3091                 :            : 
    3092                 :            :         if (!IS_ENABLED(CONFIG_FUTEX_PI))
    3093                 :            :                 return -ENOSYS;
    3094                 :            : 
    3095                 :            : retry:
    3096                 :          1 :         if (get_user(uval, uaddr))
    3097                 :            :                 return -EFAULT;
    3098                 :            :         /*
    3099                 :            :          * We release only a lock we actually own:
    3100                 :            :          */
    3101                 :          1 :         if ((uval & FUTEX_TID_MASK) != vpid)
    3102                 :            :                 return -EPERM;
    3103                 :            : 
    3104                 :          0 :         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
    3105                 :          0 :         if (ret)
    3106                 :          0 :                 return ret;
    3107                 :            : 
    3108                 :          0 :         hb = hash_futex(&key);
    3109                 :            :         spin_lock(&hb->lock);
    3110                 :            : 
    3111                 :            :         /*
    3112                 :            :          * Check waiters first. We do not trust user space values at
    3113                 :            :          * all and we at least want to know if user space fiddled
    3114                 :            :          * with the futex value instead of blindly unlocking.
    3115                 :            :          */
    3116                 :          0 :         top_waiter = futex_top_waiter(hb, &key);
    3117                 :          0 :         if (top_waiter) {
    3118                 :          0 :                 struct futex_pi_state *pi_state = top_waiter->pi_state;
    3119                 :            : 
    3120                 :            :                 ret = -EINVAL;
    3121                 :          0 :                 if (!pi_state)
    3122                 :            :                         goto out_unlock;
    3123                 :            : 
    3124                 :            :                 /*
    3125                 :            :                  * If current does not own the pi_state then the futex is
    3126                 :            :                  * inconsistent and user space fiddled with the futex value.
    3127                 :            :                  */
    3128                 :          0 :                 if (pi_state->owner != current)
    3129                 :            :                         goto out_unlock;
    3130                 :            : 
    3131                 :          0 :                 get_pi_state(pi_state);
    3132                 :            :                 /*
    3133                 :            :                  * By taking wait_lock while still holding hb->lock, we ensure
    3134                 :            :                  * there is no point where we hold neither; and therefore
    3135                 :            :                  * wake_futex_pi() must observe a state consistent with what we
    3136                 :            :                  * observed.
    3137                 :            :                  *
    3138                 :            :                  * In particular; this forces __rt_mutex_start_proxy_lock() to
    3139                 :            :                  * complete such that we're guaranteed to observe the
    3140                 :            :                  * rt_waiter. Also see the WARN in wake_futex_pi().
    3141                 :            :                  */
    3142                 :          0 :                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
    3143                 :            :                 spin_unlock(&hb->lock);
    3144                 :            : 
    3145                 :            :                 /* drops pi_state->pi_mutex.wait_lock */
    3146                 :          0 :                 ret = wake_futex_pi(uaddr, uval, pi_state);
    3147                 :            : 
    3148                 :          0 :                 put_pi_state(pi_state);
    3149                 :            : 
    3150                 :            :                 /*
    3151                 :            :                  * Success, we're done! No tricky corner cases.
    3152                 :            :                  */
    3153                 :          0 :                 if (!ret)
    3154                 :            :                         goto out_putkey;
    3155                 :            :                 /*
    3156                 :            :                  * The atomic access to the futex value generated a
    3157                 :            :                  * pagefault, so retry the user-access and the wakeup:
    3158                 :            :                  */
    3159                 :          0 :                 if (ret == -EFAULT)
    3160                 :            :                         goto pi_faulted;
    3161                 :            :                 /*
    3162                 :            :                  * An unconditional UNLOCK_PI op raced against a waiter
    3163                 :            :                  * setting the FUTEX_WAITERS bit. Try again.
    3164                 :            :                  */
    3165                 :          0 :                 if (ret == -EAGAIN)
    3166                 :            :                         goto pi_retry;
    3167                 :            :                 /*
    3168                 :            :                  * wake_futex_pi has detected invalid state. Tell user
    3169                 :            :                  * space.
    3170                 :            :                  */
    3171                 :            :                 goto out_putkey;
    3172                 :            :         }
    3173                 :            : 
    3174                 :            :         /*
    3175                 :            :          * We have no kernel internal state, i.e. no waiters in the
    3176                 :            :          * kernel. Waiters which are about to queue themselves are stuck
    3177                 :            :          * on hb->lock. So we can safely ignore them. We do neither
    3178                 :            :          * preserve the WAITERS bit nor the OWNER_DIED one. We are the
    3179                 :            :          * owner.
    3180                 :            :          */
    3181                 :          0 :         if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
    3182                 :            :                 spin_unlock(&hb->lock);
    3183                 :          0 :                 switch (ret) {
    3184                 :            :                 case -EFAULT:
    3185                 :            :                         goto pi_faulted;
    3186                 :            : 
    3187                 :            :                 case -EAGAIN:
    3188                 :            :                         goto pi_retry;
    3189                 :            : 
    3190                 :            :                 default:
    3191                 :          0 :                         WARN_ON_ONCE(1);
    3192                 :            :                         goto out_putkey;
    3193                 :            :                 }
    3194                 :            :         }
    3195                 :            : 
    3196                 :            :         /*
    3197                 :            :          * If uval has changed, let user space handle it.
    3198                 :            :          */
    3199                 :          0 :         ret = (curval == uval) ? 0 : -EAGAIN;
    3200                 :            : 
    3201                 :            : out_unlock:
    3202                 :            :         spin_unlock(&hb->lock);
    3203                 :            : out_putkey:
    3204                 :            :         put_futex_key(&key);
    3205                 :          0 :         return ret;
    3206                 :            : 
    3207                 :            : pi_retry:
    3208                 :            :         put_futex_key(&key);
    3209                 :          0 :         cond_resched();
    3210                 :          0 :         goto retry;
    3211                 :            : 
    3212                 :            : pi_faulted:
    3213                 :            :         put_futex_key(&key);
    3214                 :            : 
    3215                 :          0 :         ret = fault_in_user_writeable(uaddr);
    3216                 :          0 :         if (!ret)
    3217                 :            :                 goto retry;
    3218                 :            : 
    3219                 :          0 :         return ret;
    3220                 :            : }
    3221                 :            : 
    3222                 :            : /**
    3223                 :            :  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
    3224                 :            :  * @hb:         the hash_bucket futex_q was originally enqueued on
    3225                 :            :  * @q:          the futex_q woken while waiting to be requeued
    3226                 :            :  * @key2:       the futex_key of the requeue target futex
    3227                 :            :  * @timeout:    the timeout associated with the wait (NULL if none)
    3228                 :            :  *
    3229                 :            :  * Detect if the task was woken on the initial futex as opposed to the requeue
    3230                 :            :  * target futex.  If so, determine if it was a timeout or a signal that caused
    3231                 :            :  * the wakeup (or neither, i.e. a spurious wakeup) and return the appropriate
    3232                 :            :  * error code to the caller.  Must be called with the hb lock held.
    3233                 :            :  *
    3234                 :            :  * Return:
    3235                 :            :  *  -  0 = no early wakeup detected;
    3236                 :            :  *  - <0 = -ETIMEDOUT, -ERESTARTNOINTR, or -EWOULDBLOCK (spurious wakeup)
    3237                 :            :  */
    3238                 :            : static inline
    3239                 :          0 : int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
    3240                 :            :                                    struct futex_q *q, union futex_key *key2,
    3241                 :            :                                    struct hrtimer_sleeper *timeout)
    3242                 :            : {
    3243                 :            :         int ret = 0;
    3244                 :            : 
    3245                 :            :         /*
    3246                 :            :          * With the hb lock held, we avoid races while we process the wakeup.
    3247                 :            :          * We only need to hold hb (and not hb2) to ensure atomicity as the
    3248                 :            :          * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
    3249                 :            :          * It can't be requeued from uaddr2 to something else since we don't
    3250                 :            :          * support a PI aware source futex for requeue.
    3251                 :            :          */
    3252                 :          0 :         if (!match_futex(&q->key, key2)) {
    3253                 :          0 :                 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
    3254                 :            :                 /*
    3255                 :            :                  * We were woken prior to requeue by a timeout or a signal.
    3256                 :            :                  * Unqueue the futex_q and determine which it was.
    3257                 :            :                  */
    3258                 :          0 :                 plist_del(&q->list, &hb->chain);
    3259                 :            :                 hb_waiters_dec(hb);
    3260                 :            : 
    3261                 :            :                 /* Handle spurious wakeups gracefully: default when neither check below matches */
    3262                 :            :                 ret = -EWOULDBLOCK;
    3263                 :          0 :                 if (timeout && !timeout->task)
    3264                 :            :                         ret = -ETIMEDOUT;
    3265                 :          0 :                 else if (signal_pending(current))
    3266                 :            :                         ret = -ERESTARTNOINTR;
    3267                 :            :         }
    3268                 :          0 :         return ret;
    3269                 :            : }
    3270                 :            : 
    3271                 :            : /**
    3272                 :            :  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
    3273                 :            :  * @uaddr:      the futex we initially wait on (non-pi)
    3274                 :            :  * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
    3275                 :            :  *              the same type, no requeueing from private to shared, etc.
    3276                 :            :  * @val:        the expected value of uaddr
    3277                 :            :  * @abs_time:   absolute timeout
    3278                 :            :  * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
    3279                 :            :  * @uaddr2:     the pi futex we will take prior to returning to user-space
    3280                 :            :  *
    3281                 :            :  * The caller will wait on uaddr and will be requeued by futex_requeue() to
    3282                 :            :  * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
    3283                 :            :  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
    3284                 :            :  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
    3285                 :            :  * without one, the pi logic would not know which task to boost/deboost, if
    3286                 :            :  * there was a need to.
    3287                 :            :  *
    3288                 :            :  * We call schedule in futex_wait_queue_me() when we enqueue and return there
    3289                 :            :  * via the following--
    3290                 :            :  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
    3291                 :            :  * 2) wakeup on uaddr2 after a requeue
    3292                 :            :  * 3) signal
    3293                 :            :  * 4) timeout
    3294                 :            :  *
    3295                 :            :  * If 3, cleanup and return -ERESTARTNOINTR.
    3296                 :            :  *
    3297                 :            :  * If 2, we may then block on trying to take the rt_mutex and return via:
    3298                 :            :  * 5) successful lock
    3299                 :            :  * 6) signal
    3300                 :            :  * 7) timeout
    3301                 :            :  * 8) other lock acquisition failure
    3302                 :            :  *
    3303                 :            :  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
    3304                 :            :  *
    3305                 :            :  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
    3306                 :            :  *
    3307                 :            :  * Return:
    3308                 :            :  *  -  0 - On success;
    3309                 :            :  *  - <0 - On error
    3310                 :            :  */
    3311                 :          0 : static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
    3312                 :            :                                  u32 val, ktime_t *abs_time, u32 bitset,
    3313                 :            :                                  u32 __user *uaddr2)
    3314                 :            : {
    3315                 :            :         struct hrtimer_sleeper timeout, *to;
    3316                 :            :         struct futex_pi_state *pi_state = NULL;
    3317                 :            :         struct rt_mutex_waiter rt_waiter;
    3318                 :            :         struct futex_hash_bucket *hb;
    3319                 :          0 :         union futex_key key2 = FUTEX_KEY_INIT;
    3320                 :          0 :         struct futex_q q = futex_q_init;
    3321                 :            :         int res, ret;
    3322                 :            : 
    3323                 :            :         if (!IS_ENABLED(CONFIG_FUTEX_PI))
    3324                 :            :                 return -ENOSYS;
    3325                 :            : 
    3326                 :          0 :         if (uaddr == uaddr2)
    3327                 :            :                 return -EINVAL;
    3328                 :            : 
    3329                 :          0 :         if (!bitset)
    3330                 :            :                 return -EINVAL;
    3331                 :            : 
    3332                 :          0 :         to = futex_setup_timer(abs_time, &timeout, flags,
    3333                 :          0 :                                current->timer_slack_ns);
    3334                 :            : 
    3335                 :            :         /*
    3336                 :            :          * The waiter is allocated on our stack, manipulated by the requeue
    3337                 :            :          * code while we sleep on uaddr.
    3338                 :            :          */
    3339                 :          0 :         rt_mutex_init_waiter(&rt_waiter);
    3340                 :            : 
    3341                 :          0 :         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
    3342                 :          0 :         if (unlikely(ret != 0))
    3343                 :            :                 goto out;
    3344                 :            : 
    3345                 :          0 :         q.bitset = bitset;
    3346                 :          0 :         q.rt_waiter = &rt_waiter;
    3347                 :          0 :         q.requeue_pi_key = &key2;
    3348                 :            : 
    3349                 :            :         /*
    3350                 :            :          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
    3351                 :            :          * count.
    3352                 :            :          */
    3353                 :          0 :         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
    3354                 :          0 :         if (ret)
    3355                 :            :                 goto out_key2;
    3356                 :            : 
    3357                 :            :         /*
    3358                 :            :          * The check above which compares uaddrs is not sufficient for
    3359                 :            :          * shared futexes. We need to compare the keys:
    3360                 :            :          */
    3361                 :          0 :         if (match_futex(&q.key, &key2)) {
    3362                 :          0 :                 queue_unlock(hb);
    3363                 :            :                 ret = -EINVAL;
    3364                 :          0 :                 goto out_put_keys;
    3365                 :            :         }
    3366                 :            : 
    3367                 :            :         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
    3368                 :          0 :         futex_wait_queue_me(hb, &q, to);
    3369                 :            : 
    3370                 :          0 :         spin_lock(&hb->lock);
    3371                 :          0 :         ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
    3372                 :          0 :         spin_unlock(&hb->lock);
    3373                 :          0 :         if (ret)
    3374                 :            :                 goto out_put_keys;
    3375                 :            : 
    3376                 :            :         /*
    3377                 :            :          * In order for us to be here, we know our q.key == key2, and since
    3378                 :            :          * we took the hb->lock above, we also know that futex_requeue() has
    3379                 :            :          * completed and we no longer have to concern ourselves with a wakeup
    3380                 :            :          * race with the atomic proxy lock acquisition by the requeue code. The
    3381                 :            :          * futex_requeue dropped our key1 reference and incremented our key2
    3382                 :            :          * reference count.
    3383                 :            :          */
    3384                 :            : 
    3385                 :            :         /* Check if the requeue code acquired the second futex for us. */
    3386                 :          0 :         if (!q.rt_waiter) {
    3387                 :            :                 /*
    3388                 :            :                  * Got the lock. We might not be the anticipated owner if we
    3389                 :            :                  * did a lock-steal - fix up the PI-state in that case.
    3390                 :            :                  */
    3391                 :          0 :                 if (q.pi_state && (q.pi_state->owner != current)) {
    3392                 :          0 :                         spin_lock(q.lock_ptr);
    3393                 :          0 :                         ret = fixup_pi_state_owner(uaddr2, &q, current);
    3394                 :          0 :                         if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
    3395                 :            :                                 pi_state = q.pi_state;
    3396                 :          0 :                                 get_pi_state(pi_state);
    3397                 :            :                         }
    3398                 :            :                         /*
    3399                 :            :                          * Drop the reference to the pi state which
    3400                 :            :                          * the requeue_pi() code acquired for us.
    3401                 :            :                          */
    3402                 :          0 :                         put_pi_state(q.pi_state);
    3403                 :          0 :                         spin_unlock(q.lock_ptr);
    3404                 :            :                 }
    3405                 :            :         } else {
    3406                 :            :                 struct rt_mutex *pi_mutex;
    3407                 :            : 
    3408                 :            :                 /*
    3409                 :            :                  * We have been woken up by futex_unlock_pi(), a timeout, or a
    3410                 :            :                  * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
    3411                 :            :                  * the pi_state.
    3412                 :            :                  */
    3413                 :          0 :                 WARN_ON(!q.pi_state);
    3414                 :          0 :                 pi_mutex = &q.pi_state->pi_mutex;
    3415                 :          0 :                 ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
    3416                 :            : 
    3417                 :          0 :                 spin_lock(q.lock_ptr);
    3418                 :          0 :                 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
    3419                 :            :                         ret = 0;
    3420                 :            : 
    3421                 :            :                 debug_rt_mutex_free_waiter(&rt_waiter);
    3422                 :            :                 /*
    3423                 :            :                  * Fixup the pi_state owner and possibly acquire the lock if we
    3424                 :            :                  * haven't already.
    3425                 :            :                  */
    3426                 :          0 :                 res = fixup_owner(uaddr2, &q, !ret);
    3427                 :            :                 /*
    3428                 :            :                  * If fixup_owner() returned an error, propagate that.  If it
    3429                 :            :                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
    3430                 :            :                  */
    3431                 :          0 :                 if (res)
    3432                 :          0 :                         ret = (res < 0) ? res : 0;
    3433                 :            : 
    3434                 :            :                 /*
    3435                 :            :                  * If fixup_pi_state_owner() faulted and was unable to handle
    3436                 :            :                  * the fault, unlock the rt_mutex and return the fault to
    3437                 :            :                  * userspace.
    3438                 :            :                  */
    3439                 :          0 :                 if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
    3440                 :            :                         pi_state = q.pi_state;
    3441                 :          0 :                         get_pi_state(pi_state);
    3442                 :            :                 }
    3443                 :            : 
    3444                 :            :                 /* Unqueue and drop the lock. */
    3445                 :          0 :                 unqueue_me_pi(&q);
    3446                 :            :         }
    3447                 :            : 
    3448                 :          0 :         if (pi_state) {
    3449                 :          0 :                 rt_mutex_futex_unlock(&pi_state->pi_mutex);
    3450                 :          0 :                 put_pi_state(pi_state);
    3451                 :            :         }
    3452                 :            : 
    3453                 :          0 :         if (ret == -EINTR) {
    3454                 :            :                 /*
    3455                 :            :                  * We've already been requeued, but cannot restart by calling
    3456                 :            :                  * futex_lock_pi() directly. We could restart this syscall, but
    3457                 :            :                  * it would detect that the user space "val" changed and return
    3458                 :            :                  * -EWOULDBLOCK.  Save the overhead of the restart and return
    3459                 :            :                  * -EWOULDBLOCK directly.
    3460                 :            :                  */
    3461                 :            :                 ret = -EWOULDBLOCK;
    3462                 :            :         }
    3463                 :            : 
    3464                 :            : out_put_keys:
    3465                 :            :         put_futex_key(&q.key);
    3466                 :            : out_key2:
    3467                 :            :         put_futex_key(&key2);
    3468                 :            : 
    3469                 :            : out:
    3470                 :          0 :         if (to) {
    3471                 :          0 :                 hrtimer_cancel(&to->timer);
    3472                 :            :                 destroy_hrtimer_on_stack(&to->timer);
    3473                 :            :         }
    3474                 :          0 :         return ret;
    3475                 :            : }
    3476                 :            : 
    3477                 :            : /*
    3478                 :            :  * Support for robust futexes: the kernel cleans up held futexes at
    3479                 :            :  * thread exit time.
    3480                 :            :  *
    3481                 :            :  * Implementation: user-space maintains a per-thread list of locks it
    3482                 :            :  * is holding. Upon do_exit(), the kernel carefully walks this list,
    3483                 :            :  * and marks all locks that are owned by this thread with the
    3484                 :            :  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
    3485                 :            :  * always manipulated with the lock held, so the list is private and
    3486                 :            :  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
    3487                 :            :  * field, to allow the kernel to clean up if the thread dies after
    3488                 :            :  * acquiring the lock, but just before it could have added itself to
    3489                 :            :  * the list. There can only be one such pending lock.
    3490                 :            :  */
    3491                 :            : 
    3492                 :            : /**
    3493                 :            :  * sys_set_robust_list() - Set the robust-futex list head of a task
    3494                 :            :  * @head:       pointer to the list-head
    3495                 :            :  * @len:        length of the list-head, as userspace expects
    3496                 :            :  */
    3497                 :          3 : SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
    3498                 :            :                 size_t, len)
    3499                 :            : {
    3500                 :          3 :         if (!futex_cmpxchg_enabled)
    3501                 :            :                 return -ENOSYS;
    3502                 :            :         /*
    3503                 :            :          * The kernel knows only one size for now:
    3504                 :            :          */
    3505                 :          3 :         if (unlikely(len != sizeof(*head)))
    3506                 :            :                 return -EINVAL;
    3507                 :            : 
    3508                 :          3 :         current->robust_list = head;
    3509                 :            : 
    3510                 :            :         return 0;
    3511                 :            : }
    3512                 :            : 
    3513                 :            : /**
    3514                 :            :  * sys_get_robust_list() - Get the robust-futex list head of a task
    3515                 :            :  * @pid:        pid of the process [zero for current task]
    3516                 :            :  * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
    3517                 :            :  * @len_ptr:    pointer to a length field, the kernel fills in the header size
    3518                 :            :  */
    3519                 :          0 : SYSCALL_DEFINE3(get_robust_list, int, pid,
    3520                 :            :                 struct robust_list_head __user * __user *, head_ptr,
    3521                 :            :                 size_t __user *, len_ptr)
    3522                 :            : {
    3523                 :            :         struct robust_list_head __user *head;
    3524                 :            :         unsigned long ret;
    3525                 :            :         struct task_struct *p;
    3526                 :            : 
    3527                 :          0 :         if (!futex_cmpxchg_enabled)
    3528                 :            :                 return -ENOSYS;
    3529                 :            : 
    3530                 :            :         rcu_read_lock();
    3531                 :            : 
    3532                 :            :         ret = -ESRCH;
    3533                 :          0 :         if (!pid)
    3534                 :          0 :                 p = current;
    3535                 :            :         else {
    3536                 :          0 :                 p = find_task_by_vpid(pid);
    3537                 :          0 :                 if (!p)
    3538                 :            :                         goto err_unlock;
    3539                 :            :         }
    3540                 :            : 
    3541                 :            :         ret = -EPERM;
    3542                 :          0 :         if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
    3543                 :            :                 goto err_unlock;
    3544                 :            : 
    3545                 :          0 :         head = p->robust_list;
    3546                 :            :         rcu_read_unlock();
    3547                 :            : 
    3548                 :          0 :         if (put_user(sizeof(*head), len_ptr))
    3549                 :            :                 return -EFAULT;
    3550                 :          0 :         return put_user(head, head_ptr);
    3551                 :            : 
    3552                 :            : err_unlock:
    3553                 :            :         rcu_read_unlock();
    3554                 :            : 
    3555                 :          0 :         return ret;
    3556                 :            : }
    3557                 :            : 
    3558                 :            : /* Constants for the pending_op argument of handle_futex_death */
    3559                 :            : #define HANDLE_DEATH_PENDING    true
    3560                 :            : #define HANDLE_DEATH_LIST       false
    3561                 :            : 
    3562                 :            : /*
    3563                 :            :  * Process a futex-list entry, check whether it's owned by the
    3564                 :            :  * dying task, and do notification if so:
    3565                 :            :  */
    3566                 :          0 : static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
    3567                 :            :                               bool pi, bool pending_op)
    3568                 :            : {
    3569                 :            :         u32 uval, uninitialized_var(nval), mval;
    3570                 :            :         int err;
    3571                 :            : 
    3572                 :            :         /* Futex address must be 32bit aligned */
    3573                 :          0 :         if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
    3574                 :            :                 return -1;
    3575                 :            : 
    3576                 :            : retry:
    3577                 :          0 :         if (get_user(uval, uaddr))
    3578                 :            :                 return -1;
    3579                 :            : 
    3580                 :            :         /*
    3581                 :            :          * Special case for regular (non PI) futexes. The unlock path in
    3582                 :            :          * user space has two race scenarios:
    3583                 :            :          *
    3584                 :            :          * 1. The unlock path releases the user space futex value and
    3585                 :            :          *    before it can execute the futex() syscall to wake up
    3586                 :            :          *    waiters it is killed.
    3587                 :            :          *
    3588                 :            :          * 2. A woken up waiter is killed before it can acquire the
    3589                 :            :          *    futex in user space.
    3590                 :            :          *
    3591                 :            :          * In both cases the TID validation below prevents a wakeup of
    3592                 :            :          * potential waiters which can cause these waiters to block
    3593                 :            :          * forever.
    3594                 :            :          *
    3595                 :            :          * In both cases the following conditions are met:
    3596                 :            :          *
    3597                 :            :          *      1) task->robust_list->list_op_pending != NULL
    3598                 :            :          *         @pending_op == true
    3599                 :            :          *      2) User space futex value == 0
    3600                 :            :          *      3) Regular futex: @pi == false
    3601                 :            :          *
    3602                 :            :          * If these conditions are met, it is safe to attempt waking up a
    3603                 :            :          * potential waiter without touching the user space futex value and
    3604                 :            :          * trying to set the OWNER_DIED bit. The user space futex value is
    3605                 :            :          * uncontended and the rest of the user space mutex state is
    3606                 :            :          * consistent, so a woken waiter will just take over the
    3607                 :            :          * uncontended futex. Setting the OWNER_DIED bit would create
    3608                 :            :          * inconsistent state and malfunction of the user space owner died
    3609                 :            :          * handling.
    3610                 :            :          */
    3611                 :          0 :         if (pending_op && !pi && !uval) {
    3612                 :          0 :                 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
    3613                 :          0 :                 return 0;
    3614                 :            :         }
    3615                 :            : 
    3616                 :          0 :         if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
    3617                 :            :                 return 0;
    3618                 :            : 
    3619                 :            :         /*
    3620                 :            :          * Ok, this dying thread is truly holding a futex
    3621                 :            :          * of interest. Set the OWNER_DIED bit atomically
    3622                 :            :          * via cmpxchg, and if the value had FUTEX_WAITERS
    3623                 :            :          * set, wake up a waiter (if any). (We have to do a
    3624                 :            :          * futex_wake() even if OWNER_DIED is already set -
    3625                 :            :          * to handle the rare but possible case of recursive
    3626                 :            :          * thread-death.) The rest of the cleanup is done in
    3627                 :            :          * userspace.
    3628                 :            :          */
    3629                 :          0 :         mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
    3630                 :            : 
    3631                 :            :         /*
    3632                 :            :          * We are not holding a lock here, but we want to have
    3633                 :            :          * the pagefault_disable/enable() protection because
    3634                 :            :          * we want to handle the fault gracefully. If the
    3635                 :            :          * access fails we try to fault in the futex with R/W
    3636                 :            :          * verification via get_user_pages. get_user() above
    3637                 :            :          * does not guarantee R/W access. If that fails we
    3638                 :            :          * give up and leave the futex locked.
    3639                 :            :          */
    3640                 :          0 :         if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
    3641                 :          0 :                 switch (err) {
    3642                 :            :                 case -EFAULT:
    3643                 :          0 :                         if (fault_in_user_writeable(uaddr))
    3644                 :            :                                 return -1;
    3645                 :            :                         goto retry;
    3646                 :            : 
    3647                 :            :                 case -EAGAIN:
    3648                 :          0 :                         cond_resched();
    3649                 :          0 :                         goto retry;
    3650                 :            : 
    3651                 :            :                 default:
    3652                 :          0 :                         WARN_ON_ONCE(1);
    3653                 :          0 :                         return err;
    3654                 :            :                 }
    3655                 :            :         }
    3656                 :            : 
    3657                 :          0 :         if (nval != uval)
    3658                 :            :                 goto retry;
    3659                 :            : 
    3660                 :            :         /*
    3661                 :            :          * Wake robust non-PI futexes here. The wakeup of
    3662                 :            :          * PI futexes happens in exit_pi_state_list():
    3663                 :            :          */
    3664                 :          0 :         if (!pi && (uval & FUTEX_WAITERS))
    3665                 :          0 :                 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
    3666                 :            : 
    3667                 :            :         return 0;
    3668                 :            : }
    3669                 :            : 
    3670                 :            : /*
    3671                 :            :  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
    3672                 :            :  */
    3673                 :          3 : static inline int fetch_robust_entry(struct robust_list __user **entry,
    3674                 :            :                                      struct robust_list __user * __user *head,
    3675                 :            :                                      unsigned int *pi)
    3676                 :            : {
    3677                 :            :         unsigned long uentry;
    3678                 :            : 
    3679                 :          3 :         if (get_user(uentry, (unsigned long __user *)head))
    3680                 :            :                 return -EFAULT;
    3681                 :            : 
    3682                 :          3 :         *entry = (void __user *)(uentry & ~1UL);
    3683                 :          3 :         *pi = uentry & 1;
    3684                 :            : 
    3685                 :          3 :         return 0;
    3686                 :            : }
    3687                 :            : 
    3688                 :            : /*
    3689                 :            :  * Walk curr->robust_list (very carefully, it's a userspace list!)
    3690                 :            :  * and mark any locks found there dead, and notify any waiters.
    3691                 :            :  *
    3692                 :            :  * We silently return on any sign of list-walking problem.
    3693                 :            :  */
    3694                 :          3 : static void exit_robust_list(struct task_struct *curr)
    3695                 :            : {
    3696                 :          3 :         struct robust_list_head __user *head = curr->robust_list;
    3697                 :            :         struct robust_list __user *entry, *next_entry, *pending;
    3698                 :            :         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
    3699                 :            :         unsigned int uninitialized_var(next_pi);
    3700                 :            :         unsigned long futex_offset;
    3701                 :            :         int rc;
    3702                 :            : 
    3703                 :          3 :         if (!futex_cmpxchg_enabled)
    3704                 :          0 :                 return;
    3705                 :            : 
    3706                 :            :         /*
    3707                 :            :          * Fetch the list head (which was registered earlier, via
    3708                 :            :          * sys_set_robust_list()):
    3709                 :            :          */
    3710                 :          3 :         if (fetch_robust_entry(&entry, &head->list.next, &pi))
    3711                 :            :                 return;
    3712                 :            :         /*
    3713                 :            :          * Fetch the relative futex offset:
    3714                 :            :          */
    3715                 :          3 :         if (get_user(futex_offset, &head->futex_offset))
    3716                 :            :                 return;
    3717                 :            :         /*
    3718                 :            :          * Fetch any possibly pending lock-add first, and handle it
    3719                 :            :          * if it exists:
    3720                 :            :          */
    3721                 :          3 :         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
    3722                 :            :                 return;
    3723                 :            : 
    3724                 :          3 :         next_entry = NULL;      /* avoid warning with gcc */
    3725                 :          3 :         while (entry != &head->list) {
    3726                 :            :                 /*
    3727                 :            :                  * Fetch the next entry in the list before calling
    3728                 :            :                  * handle_futex_death:
    3729                 :            :                  */
    3730                 :          0 :                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
    3731                 :            :                 /*
    3732                 :            :                  * A pending lock might already be on the list, so
    3733                 :            :                  * don't process it twice:
    3734                 :            :                  */
    3735                 :          0 :                 if (entry != pending) {
    3736                 :          0 :                         if (handle_futex_death((void __user *)entry + futex_offset,
    3737                 :            :                                                 curr, pi, HANDLE_DEATH_LIST))
    3738                 :            :                                 return;
    3739                 :            :                 }
    3740                 :          0 :                 if (rc)
    3741                 :            :                         return;
    3742                 :          0 :                 entry = next_entry;
    3743                 :          0 :                 pi = next_pi;
    3744                 :            :                 /*
    3745                 :            :                  * Avoid excessively long or circular lists:
    3746                 :            :                  */
    3747                 :          0 :                 if (!--limit)
    3748                 :            :                         break;
    3749                 :            : 
    3750                 :          0 :                 cond_resched();
    3751                 :            :         }
    3752                 :            : 
    3753                 :          3 :         if (pending) {
    3754                 :          0 :                 handle_futex_death((void __user *)pending + futex_offset,
    3755                 :            :                                    curr, pip, HANDLE_DEATH_PENDING);
    3756                 :            :         }
    3757                 :            : }
    3758                 :            : 
    3759                 :          3 : static void futex_cleanup(struct task_struct *tsk)
    3760                 :            : {
    3761                 :          3 :         if (unlikely(tsk->robust_list)) {
    3762                 :          3 :                 exit_robust_list(tsk);
    3763                 :          3 :                 tsk->robust_list = NULL;
    3764                 :            :         }
    3765                 :            : 
    3766                 :            : #ifdef CONFIG_COMPAT
    3767                 :            :         if (unlikely(tsk->compat_robust_list)) {
    3768                 :            :                 compat_exit_robust_list(tsk);
    3769                 :            :                 tsk->compat_robust_list = NULL;
    3770                 :            :         }
    3771                 :            : #endif
    3772                 :            : 
    3773                 :          3 :         if (unlikely(!list_empty(&tsk->pi_state_list)))
    3774                 :          0 :                 exit_pi_state_list(tsk);
    3775                 :          3 : }
    3776                 :            : 
    3777                 :            : /**
    3778                 :            :  * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
    3779                 :            :  * @tsk:        task to set the state on
    3780                 :            :  *
    3781                 :            :  * Set the futex exit state of the task lockless. The futex waiter code
    3782                 :            :  * observes that state when a task is exiting and loops until the task has
    3783                 :            :  * actually finished the futex cleanup. The worst case for this is that the
    3784                 :            :  * waiter runs through the wait loop until the state becomes visible.
    3785                 :            :  *
    3786                 :            :  * This is called from the recursive fault handling path in do_exit().
    3787                 :            :  *
    3788                 :            :  * This is best effort. Either the futex exit code has run already or
    3789                 :            :  * not. If the OWNER_DIED bit has been set on the futex then the waiter can
    3790                 :            :  * take it over. If not, the problem is pushed back to user space. If the
    3791                 :            :  * futex exit code did not run yet, then an already queued waiter might
    3792                 :            :  * block forever, but there is nothing which can be done about that.
    3793                 :            :  */
    3794                 :          0 : void futex_exit_recursive(struct task_struct *tsk)
    3795                 :            : {
    3796                 :            :         /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
    3797                 :          0 :         if (tsk->futex_state == FUTEX_STATE_EXITING)
    3798                 :          0 :                 mutex_unlock(&tsk->futex_exit_mutex);
    3799                 :          0 :         tsk->futex_state = FUTEX_STATE_DEAD;
    3800                 :          0 : }
    3801                 :            : 
    3802                 :          3 : static void futex_cleanup_begin(struct task_struct *tsk)
    3803                 :            : {
    3804                 :            :         /*
    3805                 :            :          * Prevent various race issues against a concurrent incoming waiter
    3806                 :            :          * including live locks by forcing the waiter to block on
    3807                 :            :          * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
    3808                 :            :          * attach_to_pi_owner().
    3809                 :            :          */
    3810                 :          3 :         mutex_lock(&tsk->futex_exit_mutex);
    3811                 :            : 
    3812                 :            :         /*
    3813                 :            :          * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
    3814                 :            :          *
    3815                 :            :          * This ensures that all subsequent checks of tsk->futex_state in
    3816                 :            :          * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
    3817                 :            :          * tsk->pi_lock held.
    3818                 :            :          *
    3819                 :            :          * It guarantees also that a pi_state which was queued right before
    3820                 :            :          * the state change under tsk->pi_lock by a concurrent waiter must
    3821                 :            :          * be observed in exit_pi_state_list().
    3822                 :            :          */
    3823                 :          3 :         raw_spin_lock_irq(&tsk->pi_lock);
    3824                 :          3 :         tsk->futex_state = FUTEX_STATE_EXITING;
    3825                 :          3 :         raw_spin_unlock_irq(&tsk->pi_lock);
    3826                 :          3 : }
    3827                 :            : 
    3828                 :            : static void futex_cleanup_end(struct task_struct *tsk, int state)
    3829                 :            : {
    3830                 :            :         /*
    3831                 :            :          * Lockless store. The only side effect is that an observer might
    3832                 :            :          * take another loop until it becomes visible.
    3833                 :            :          */
    3834                 :          3 :         tsk->futex_state = state;
    3835                 :            :         /*
    3836                 :            :          * Drop the exit protection. This unblocks waiters which observed
    3837                 :            :          * FUTEX_STATE_EXITING to reevaluate the state.
    3838                 :            :          */
    3839                 :          3 :         mutex_unlock(&tsk->futex_exit_mutex);
    3840                 :            : }
    3841                 :            : 
    3842                 :          3 : void futex_exec_release(struct task_struct *tsk)
    3843                 :            : {
    3844                 :            :         /*
    3845                 :            :          * The state handling is done for consistency, but in the case of
    3846                 :            :          * exec() there is no way to prevent further damage as the PID stays
    3847                 :            :          * the same. But for the unlikely and arguably buggy case that a
    3848                 :            :          * futex is held on exec(), this provides at least as much state
    3849                 :            :          * consistency protection as is possible.
    3850                 :            :          */
    3851                 :          3 :         futex_cleanup_begin(tsk);
    3852                 :          3 :         futex_cleanup(tsk);
    3853                 :            :         /*
    3854                 :            :          * Reset the state to FUTEX_STATE_OK. The task is alive and about
    3855                 :            :          * to exec a new binary.
    3856                 :            :          */
    3857                 :            :         futex_cleanup_end(tsk, FUTEX_STATE_OK);
    3858                 :          3 : }
    3859                 :            : 
    3860                 :          3 : void futex_exit_release(struct task_struct *tsk)
    3861                 :            : {
    3862                 :          3 :         futex_cleanup_begin(tsk);
    3863                 :          3 :         futex_cleanup(tsk);
    3864                 :            :         futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
    3865                 :          3 : }
    3866                 :            : 
    3867                 :          3 : long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
    3868                 :            :                 u32 __user *uaddr2, u32 val2, u32 val3)
    3869                 :            : {
    3870                 :          3 :         int cmd = op & FUTEX_CMD_MASK;
    3871                 :            :         unsigned int flags = 0;
    3872                 :            : 
    3873                 :          3 :         if (!(op & FUTEX_PRIVATE_FLAG))
    3874                 :            :                 flags |= FLAGS_SHARED;
    3875                 :            : 
    3876                 :          3 :         if (op & FUTEX_CLOCK_REALTIME) {
    3877                 :          3 :                 flags |= FLAGS_CLOCKRT;
    3878                 :          3 :                 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
    3879                 :            :                     cmd != FUTEX_WAIT_REQUEUE_PI)
    3880                 :            :                         return -ENOSYS;
    3881                 :            :         }
    3882                 :            : 
    3883                 :          3 :         switch (cmd) {
    3884                 :            :         case FUTEX_LOCK_PI:
    3885                 :            :         case FUTEX_UNLOCK_PI:
    3886                 :            :         case FUTEX_TRYLOCK_PI:
    3887                 :            :         case FUTEX_WAIT_REQUEUE_PI:
    3888                 :            :         case FUTEX_CMP_REQUEUE_PI:
    3889                 :          1 :                 if (!futex_cmpxchg_enabled)
    3890                 :            :                         return -ENOSYS;
    3891                 :            :         }
    3892                 :            : 
    3893                 :          3 :         switch (cmd) {
    3894                 :            :         case FUTEX_WAIT:
    3895                 :          3 :                 val3 = FUTEX_BITSET_MATCH_ANY;
    3896                 :            :                 /* fall through */
    3897                 :            :         case FUTEX_WAIT_BITSET:
    3898                 :          3 :                 return futex_wait(uaddr, flags, val, timeout, val3);
    3899                 :            :         case FUTEX_WAKE:
    3900                 :          3 :                 val3 = FUTEX_BITSET_MATCH_ANY;
    3901                 :            :                 /* fall through */
    3902                 :            :         case FUTEX_WAKE_BITSET:
    3903                 :          3 :                 return futex_wake(uaddr, flags, val, val3);
    3904                 :            :         case FUTEX_REQUEUE:
    3905                 :          0 :                 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
    3906                 :            :         case FUTEX_CMP_REQUEUE:
    3907                 :          0 :                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
    3908                 :            :         case FUTEX_WAKE_OP:
    3909                 :          0 :                 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
    3910                 :            :         case FUTEX_LOCK_PI:
    3911                 :          0 :                 return futex_lock_pi(uaddr, flags, timeout, 0);
    3912                 :            :         case FUTEX_UNLOCK_PI:
    3913                 :          1 :                 return futex_unlock_pi(uaddr, flags);
    3914                 :            :         case FUTEX_TRYLOCK_PI:
    3915                 :          0 :                 return futex_lock_pi(uaddr, flags, NULL, 1);
    3916                 :            :         case FUTEX_WAIT_REQUEUE_PI:
    3917                 :          0 :                 val3 = FUTEX_BITSET_MATCH_ANY;
    3918                 :          0 :                 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
    3919                 :            :                                              uaddr2);
    3920                 :            :         case FUTEX_CMP_REQUEUE_PI:
    3921                 :          0 :                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
    3922                 :            :         }
    3923                 :            :         return -ENOSYS;
    3924                 :            : }
    3925                 :            : 
    3926                 :            : 
    3927                 :          0 : SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
    3928                 :            :                 struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
    3929                 :            :                 u32, val3)
    3930                 :            : {
    3931                 :            :         struct timespec64 ts;
    3932                 :            :         ktime_t t, *tp = NULL;
    3933                 :            :         u32 val2 = 0;
    3934                 :          0 :         int cmd = op & FUTEX_CMD_MASK;
    3935                 :            : 
    3936                 :          0 :         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
    3937                 :          0 :                       cmd == FUTEX_WAIT_BITSET ||
    3938                 :          0 :                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
    3939                 :            :                 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
    3940                 :            :                         return -EFAULT;
    3941                 :          0 :                 if (get_timespec64(&ts, utime))
    3942                 :            :                         return -EFAULT;
    3943                 :          0 :                 if (!timespec64_valid(&ts))
    3944                 :            :                         return -EINVAL;
    3945                 :            : 
    3946                 :          0 :                 t = timespec64_to_ktime(ts);
    3947                 :          0 :                 if (cmd == FUTEX_WAIT)
    3948                 :          0 :                         t = ktime_add_safe(ktime_get(), t);
    3949                 :            :                 tp = &t;
    3950                 :            :         }
    3951                 :            :         /*
    3952                 :            :          * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
    3953                 :            :          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
    3954                 :            :          */
    3955                 :          0 :         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
    3956                 :          0 :             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
    3957                 :          0 :                 val2 = (u32) (unsigned long) utime;
    3958                 :            : 
    3959                 :          0 :         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
    3960                 :            : }
    3961                 :            : 
    3962                 :            : #ifdef CONFIG_COMPAT
    3963                 :            : /*
    3964                 :            :  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
    3965                 :            :  */
    3966                 :            : static inline int
    3967                 :            : compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
    3968                 :            :                    compat_uptr_t __user *head, unsigned int *pi)
    3969                 :            : {
    3970                 :            :         if (get_user(*uentry, head))
    3971                 :            :                 return -EFAULT;
    3972                 :            : 
    3973                 :            :         *entry = compat_ptr((*uentry) & ~1);
    3974                 :            :         *pi = (unsigned int)(*uentry) & 1;
    3975                 :            : 
    3976                 :            :         return 0;
    3977                 :            : }
    3978                 :            : 
    3979                 :            : static void __user *futex_uaddr(struct robust_list __user *entry,
    3980                 :            :                                 compat_long_t futex_offset)
    3981                 :            : {
    3982                 :            :         compat_uptr_t base = ptr_to_compat(entry);
    3983                 :            :         void __user *uaddr = compat_ptr(base + futex_offset);
    3984                 :            : 
    3985                 :            :         return uaddr;
    3986                 :            : }
    3987                 :            : 
    3988                 :            : /*
    3989                 :            :  * Walk curr->robust_list (very carefully, it's a userspace list!)
    3990                 :            :  * and mark any locks found there dead, and notify any waiters.
    3991                 :            :  *
    3992                 :            :  * We silently return on any sign of list-walking problem.
    3993                 :            :  */
    3994                 :            : static void compat_exit_robust_list(struct task_struct *curr)
    3995                 :            : {
    3996                 :            :         struct compat_robust_list_head __user *head = curr->compat_robust_list;
    3997                 :            :         struct robust_list __user *entry, *next_entry, *pending;
    3998                 :            :         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
    3999                 :            :         unsigned int uninitialized_var(next_pi);
    4000                 :            :         compat_uptr_t uentry, next_uentry, upending;
    4001                 :            :         compat_long_t futex_offset;
    4002                 :            :         int rc;
    4003                 :            : 
    4004                 :            :         if (!futex_cmpxchg_enabled)
    4005                 :            :                 return;
    4006                 :            : 
    4007                 :            :         /*
    4008                 :            :          * Fetch the list head (which was registered earlier, via
    4009                 :            :          * sys_set_robust_list()):
    4010                 :            :          */
    4011                 :            :         if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
    4012                 :            :                 return;
    4013                 :            :         /*
    4014                 :            :          * Fetch the relative futex offset:
    4015                 :            :          */
    4016                 :            :         if (get_user(futex_offset, &head->futex_offset))
    4017                 :            :                 return;
    4018                 :            :         /*
    4019                 :            :          * Fetch any possibly pending lock-add first, and handle it
    4020                 :            :          * if it exists:
    4021                 :            :          */
    4022                 :            :         if (compat_fetch_robust_entry(&upending, &pending,
    4023                 :            :                                &head->list_op_pending, &pip))
    4024                 :            :                 return;
    4025                 :            : 
    4026                 :            :         next_entry = NULL;      /* avoid warning with gcc */
    4027                 :            :         while (entry != (struct robust_list __user *) &head->list) {
    4028                 :            :                 /*
    4029                 :            :                  * Fetch the next entry in the list before calling
    4030                 :            :                  * handle_futex_death:
    4031                 :            :                  */
    4032                 :            :                 rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
    4033                 :            :                         (compat_uptr_t __user *)&entry->next, &next_pi);
    4034                 :            :                 /*
    4035                 :            :                  * A pending lock might already be on the list, so
    4036                 :            :                  * don't process it twice:
    4037                 :            :                  */
    4038                 :            :                 if (entry != pending) {
    4039                 :            :                         void __user *uaddr = futex_uaddr(entry, futex_offset);
    4040                 :            : 
    4041                 :            :                         if (handle_futex_death(uaddr, curr, pi,
    4042                 :            :                                                HANDLE_DEATH_LIST))
    4043                 :            :                                 return;
    4044                 :            :                 }
    4045                 :            :                 if (rc)
    4046                 :            :                         return;
    4047                 :            :                 uentry = next_uentry;
    4048                 :            :                 entry = next_entry;
    4049                 :            :                 pi = next_pi;
    4050                 :            :                 /*
    4051                 :            :                  * Avoid excessively long or circular lists:
    4052                 :            :                  */
    4053                 :            :                 if (!--limit)
    4054                 :            :                         break;
    4055                 :            : 
    4056                 :            :                 cond_resched();
    4057                 :            :         }
    4058                 :            :         if (pending) {
    4059                 :            :                 void __user *uaddr = futex_uaddr(pending, futex_offset);
    4060                 :            : 
    4061                 :            :                 handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
    4062                 :            :         }
    4063                 :            : }
    4064                 :            : 
    4065                 :            : COMPAT_SYSCALL_DEFINE2(set_robust_list,
    4066                 :            :                 struct compat_robust_list_head __user *, head,
    4067                 :            :                 compat_size_t, len)
    4068                 :            : {
    4069                 :            :         if (!futex_cmpxchg_enabled)
    4070                 :            :                 return -ENOSYS;
    4071                 :            : 
    4072                 :            :         if (unlikely(len != sizeof(*head)))
    4073                 :            :                 return -EINVAL;
    4074                 :            : 
    4075                 :            :         current->compat_robust_list = head;
    4076                 :            : 
    4077                 :            :         return 0;
    4078                 :            : }
    4079                 :            : 
    4080                 :            : COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
    4081                 :            :                         compat_uptr_t __user *, head_ptr,
    4082                 :            :                         compat_size_t __user *, len_ptr)
    4083                 :            : {
    4084                 :            :         struct compat_robust_list_head __user *head;
    4085                 :            :         unsigned long ret;
    4086                 :            :         struct task_struct *p;
    4087                 :            : 
    4088                 :            :         if (!futex_cmpxchg_enabled)
    4089                 :            :                 return -ENOSYS;
    4090                 :            : 
    4091                 :            :         rcu_read_lock();
    4092                 :            : 
    4093                 :            :         ret = -ESRCH;
    4094                 :            :         if (!pid)
    4095                 :            :                 p = current;
    4096                 :            :         else {
    4097                 :            :                 p = find_task_by_vpid(pid);
    4098                 :            :                 if (!p)
    4099                 :            :                         goto err_unlock;
    4100                 :            :         }
    4101                 :            : 
    4102                 :            :         ret = -EPERM;
    4103                 :            :         if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
    4104                 :            :                 goto err_unlock;
    4105                 :            : 
    4106                 :            :         head = p->compat_robust_list;
    4107                 :            :         rcu_read_unlock();
    4108                 :            : 
    4109                 :            :         if (put_user(sizeof(*head), len_ptr))
    4110                 :            :                 return -EFAULT;
    4111                 :            :         return put_user(ptr_to_compat(head), head_ptr);
    4112                 :            : 
    4113                 :            : err_unlock:
    4114                 :            :         rcu_read_unlock();
    4115                 :            : 
    4116                 :            :         return ret;
    4117                 :            : }
    4118                 :            : #endif /* CONFIG_COMPAT */
    4119                 :            : 
    4120                 :            : #ifdef CONFIG_COMPAT_32BIT_TIME
    4121                 :          3 : SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
    4122                 :            :                 struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
    4123                 :            :                 u32, val3)
    4124                 :            : {
    4125                 :            :         struct timespec64 ts;
    4126                 :            :         ktime_t t, *tp = NULL;
    4127                 :            :         int val2 = 0;
    4128                 :          3 :         int cmd = op & FUTEX_CMD_MASK;
    4129                 :            : 
    4130                 :          3 :         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
    4131                 :          0 :                       cmd == FUTEX_WAIT_BITSET ||
    4132                 :          0 :                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
    4133                 :          3 :                 if (get_old_timespec32(&ts, utime))
    4134                 :            :                         return -EFAULT;
    4135                 :          3 :                 if (!timespec64_valid(&ts))
    4136                 :            :                         return -EINVAL;
    4137                 :            : 
    4138                 :          3 :                 t = timespec64_to_ktime(ts);
    4139                 :          3 :                 if (cmd == FUTEX_WAIT)
    4140                 :          3 :                         t = ktime_add_safe(ktime_get(), t);
    4141                 :            :                 tp = &t;
    4142                 :            :         }
    4143                 :          3 :         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
    4144                 :          3 :             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
    4145                 :          0 :                 val2 = (int) (unsigned long) utime;
    4146                 :            : 
    4147                 :          3 :         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
    4148                 :            : }
    4149                 :            : #endif /* CONFIG_COMPAT_32BIT_TIME */
    4150                 :            : 
    4151                 :          3 : static void __init futex_detect_cmpxchg(void)
    4152                 :            : {
    4153                 :            : #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
    4154                 :            :         u32 curval;
    4155                 :            : 
    4156                 :            :         /*
    4157                 :            :          * This will fail and we want it. Some arch implementations do
    4158                 :            :          * runtime detection of the futex_atomic_cmpxchg_inatomic()
    4159                 :            :          * functionality. We want to know that before we call in any
    4160                 :            :          * of the complex code paths. Also we want to prevent
    4161                 :            :          * registration of robust lists in that case. NULL is
    4162                 :            :          * guaranteed to fault, so we get -EFAULT on a functional
    4163                 :            :          * implementation; the non-functional ones will return
    4164                 :            :          * -ENOSYS.
    4165                 :            :          */
    4166                 :          3 :         if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
    4167                 :          3 :                 futex_cmpxchg_enabled = 1;
    4168                 :            : #endif
    4169                 :          3 : }
    4170                 :            : 
    4171                 :          3 : static int __init futex_init(void)
    4172                 :            : {
    4173                 :            :         unsigned int futex_shift;
    4174                 :            :         unsigned long i;
    4175                 :            : 
    4176                 :            : #if CONFIG_BASE_SMALL
    4177                 :            :         futex_hashsize = 16;
    4178                 :            : #else
    4179                 :          3 :         futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
    4180                 :            : #endif
    4181                 :            : 
    4182                 :          3 :         futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
    4183                 :            :                                                futex_hashsize, 0,
    4184                 :            :                                                futex_hashsize < 256 ? HASH_SMALL : 0,
    4185                 :            :                                                &futex_shift, NULL,
    4186                 :            :                                                futex_hashsize, futex_hashsize);
    4187                 :          3 :         futex_hashsize = 1UL << futex_shift;
    4188                 :            : 
    4189                 :          3 :         futex_detect_cmpxchg();
    4190                 :            : 
    4191                 :          3 :         for (i = 0; i < futex_hashsize; i++) {
    4192                 :          3 :                 atomic_set(&futex_queues[i].waiters, 0);
    4193                 :            :                 plist_head_init(&futex_queues[i].chain);
    4194                 :          3 :                 spin_lock_init(&futex_queues[i].lock);
    4195                 :            :         }
    4196                 :            : 
    4197                 :          3 :         return 0;
    4198                 :            : }
    4199                 :            : core_initcall(futex_init);
    

Generated by: LCOV version 1.14