LCOV - combined.info - mm/mempolicy.c

LCOV - code coverage report

Current view:	top level - mm - mempolicy.c (source / functions)		Hit	Total	Coverage
Test:	combined.info	Lines:	193	1184	16.3 %
Date:	2022-04-01 14:58:12	Functions:	21	84	25.0 %
		Branches:	89	819	10.9 %

           Branch data     Line data    Source code

       1                 :            : // SPDX-License-Identifier: GPL-2.0-only
       2                 :            : /*
       3                 :            :  * Simple NUMA memory policy for the Linux kernel.
       4                 :            :  *
       5                 :            :  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
       6                 :            :  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
       7                 :            :  *
       8                 :            :  * NUMA policy allows the user to give hints in which node(s) memory should
       9                 :            :  * be allocated.
      10                 :            :  *
      11                 :            :  * Support four policies per VMA and per process:
      12                 :            :  *
      13                 :            :  * The VMA policy has priority over the process policy for a page fault.
      14                 :            :  *
      15                 :            :  * interleave     Allocate memory interleaved over a set of nodes,
      16                 :            :  *                with normal fallback if it fails.
      17                 :            :  *                For VMA based allocations this interleaves based on the
      18                 :            :  *                offset into the backing object or offset into the mapping
      19                 :            :  *                for anonymous memory. For process policy a process counter
      20                 :            :  *                is used.
      21                 :            :  *
      22                 :            :  * bind           Only allocate memory on a specific set of nodes,
      23                 :            :  *                no fallback.
      24                 :            :  *                FIXME: memory is allocated starting with the first node
      25                 :            :  *                to the last. It would be better if bind would truly restrict
      26                 :            :  *                the allocation to memory nodes instead
      27                 :            :  *
      28                 :            :  * preferred       Try a specific node first before normal fallback.
      29                 :            :  *                As a special case NUMA_NO_NODE here means do the allocation
      30                 :            :  *                on the local CPU. This is normally identical to default,
      31                 :            :  *                but useful to set in a VMA when you have a non default
      32                 :            :  *                process policy.
      33                 :            :  *
      34                 :            :  * default        Allocate on the local node first, or when on a VMA
      35                 :            :  *                use the process policy. This is what Linux always did
      36                 :            :  *                in a NUMA aware kernel and still does by, ahem, default.
      37                 :            :  *
      38                 :            :  * The process policy is applied for most non interrupt memory allocations
      39                 :            :  * in that process' context. Interrupts ignore the policies and always
      40                 :            :  * try to allocate on the local CPU. The VMA policy is only applied for memory
      41                 :            :  * allocations for a VMA in the VM.
      42                 :            :  *
      43                 :            :  * Currently there are a few corner cases in swapping where the policy
      44                 :            :  * is not applied, but the majority should be handled. When process policy
      45                 :            :  * is used it is not remembered over swap outs/swap ins.
      46                 :            :  *
      47                 :            :  * Only the highest zone in the zone hierarchy gets policied. Allocations
      48                 :            :  * requesting a lower zone just use default policy. This implies that
      49                 :            :  * on systems with highmem kernel lowmem allocations don't get policied.
      50                 :            :  * Same with GFP_DMA allocations.
      51                 :            :  *
      52                 :            :  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
      53                 :            :  * all users and remembered even when nobody has memory mapped.
      54                 :            :  */
      55                 :            : 
      56                 :            : /* Notebook:
      57                 :            :    fix mmap readahead to honour policy and enable policy for any page cache
      58                 :            :    object
      59                 :            :    statistics for bigpages
      60                 :            :    global policy for page cache? currently it uses process policy. Requires
      61                 :            :    first item above.
      62                 :            :    handle mremap for shared memory (currently ignored for the policy)
      63                 :            :    grows down?
      64                 :            :    make bind policy root only? It can trigger oom much faster and the
      65                 :            :    kernel is not always grateful with that.
      66                 :            : */
      67                 :            : 
      68                 :            : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      69                 :            : 
      70                 :            : #include <linux/mempolicy.h>
      71                 :            : #include <linux/pagewalk.h>
      72                 :            : #include <linux/highmem.h>
      73                 :            : #include <linux/hugetlb.h>
      74                 :            : #include <linux/kernel.h>
      75                 :            : #include <linux/sched.h>
      76                 :            : #include <linux/sched/mm.h>
      77                 :            : #include <linux/sched/numa_balancing.h>
      78                 :            : #include <linux/sched/task.h>
      79                 :            : #include <linux/nodemask.h>
      80                 :            : #include <linux/cpuset.h>
      81                 :            : #include <linux/slab.h>
      82                 :            : #include <linux/string.h>
      83                 :            : #include <linux/export.h>
      84                 :            : #include <linux/nsproxy.h>
      85                 :            : #include <linux/interrupt.h>
      86                 :            : #include <linux/init.h>
      87                 :            : #include <linux/compat.h>
      88                 :            : #include <linux/ptrace.h>
      89                 :            : #include <linux/swap.h>
      90                 :            : #include <linux/seq_file.h>
      91                 :            : #include <linux/proc_fs.h>
      92                 :            : #include <linux/migrate.h>
      93                 :            : #include <linux/ksm.h>
      94                 :            : #include <linux/rmap.h>
      95                 :            : #include <linux/security.h>
      96                 :            : #include <linux/syscalls.h>
      97                 :            : #include <linux/ctype.h>
      98                 :            : #include <linux/mm_inline.h>
      99                 :            : #include <linux/mmu_notifier.h>
     100                 :            : #include <linux/printk.h>
     101                 :            : #include <linux/swapops.h>
     102                 :            : 
     103                 :            : #include <asm/tlbflush.h>
     104                 :            : #include <linux/uaccess.h>
     105                 :            : 
     106                 :            : #include "internal.h"
     107                 :            : 
     108                 :            : /* Internal flags */
     109                 :            : #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)      /* Skip checks for contiguous vmas */
     110                 :            : #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)            /* Invert check for nodemask */
     111                 :            : 
     112                 :            : static struct kmem_cache *policy_cache;
     113                 :            : static struct kmem_cache *sn_cache;
     114                 :            : 
     115                 :            : /* Highest zone. A specific allocation for a zone below that is not
     116                 :            :    policied. */
     117                 :            : enum zone_type policy_zone = 0;
     118                 :            : 
     119                 :            : /*
     120                 :            :  * run-time system-wide default policy => local allocation
     121                 :            :  */
     122                 :            : static struct mempolicy default_policy = {
     123                 :            :         .refcnt = ATOMIC_INIT(1), /* never free it */
     124                 :            :         .mode = MPOL_PREFERRED,
     125                 :            :         .flags = MPOL_F_LOCAL,
     126                 :            : };
     127                 :            : 
     128                 :            : static struct mempolicy preferred_node_policy[MAX_NUMNODES];
     129                 :            : 
     130                 :     344726 : struct mempolicy *get_task_policy(struct task_struct *p)
     131                 :            : {
     132                 :     344726 :         struct mempolicy *pol = p->mempolicy;
     133                 :     344726 :         int node;
     134                 :            : 
     135   [ +  +  -  -  :     344726 :         if (pol)
             +  -  -  - ]
     136                 :            :                 return pol;
     137                 :            : 
     138   [ +  -  -  -  :     325984 :         node = numa_node_id();
             +  -  -  - ]
     139   [ +  -  -  -  :     325984 :         if (node != NUMA_NO_NODE) {
             +  -  -  - ]
     140                 :     325984 :                 pol = &preferred_node_policy[node];
     141                 :            :                 /* preferred_node_policy is not initialised early in boot */
     142   [ +  +  -  -  :     325984 :                 if (pol->mode)
             +  -  -  - ]
     143                 :     325810 :                         return pol;
     144                 :            :         }
     145                 :            : 
     146                 :            :         return &default_policy;
     147                 :            : }
     148                 :            : 
     149                 :            : static const struct mempolicy_operations {
     150                 :            :         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
     151                 :            :         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
     152                 :            : } mpol_ops[MPOL_MAX];
     153                 :            : 
     154                 :          3 : static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
     155                 :            : {
     156                 :          3 :         return pol->flags & MPOL_MODE_FLAGS;
     157                 :            : }
     158                 :            : 
     159                 :          0 : static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
     160                 :            :                                    const nodemask_t *rel)
     161                 :            : {
     162                 :          0 :         nodemask_t tmp;
     163                 :          0 :         nodes_fold(tmp, *orig, nodes_weight(*rel));
     164                 :          0 :         nodes_onto(*ret, tmp, *rel);
     165                 :          0 : }
     166                 :            : 
     167                 :          3 : static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
     168                 :            : {
     169         [ +  - ]:          3 :         if (nodes_empty(*nodes))
     170                 :            :                 return -EINVAL;
     171                 :          3 :         pol->v.nodes = *nodes;
     172                 :          3 :         return 0;
     173                 :            : }
     174                 :            : 
     175                 :          0 : static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
     176                 :            : {
     177         [ #  # ]:          0 :         if (!nodes)
     178                 :          0 :                 pol->flags |= MPOL_F_LOCAL;  /* local allocation */
     179         [ #  # ]:          0 :         else if (nodes_empty(*nodes))
     180                 :            :                 return -EINVAL;                 /*  no allowed nodes */
     181                 :            :         else
     182                 :          0 :                 pol->v.preferred_node = first_node(*nodes);
     183                 :            :         return 0;
     184                 :            : }
     185                 :            : 
     186                 :          0 : static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
     187                 :            : {
     188         [ #  # ]:          0 :         if (nodes_empty(*nodes))
     189                 :            :                 return -EINVAL;
     190                 :          0 :         pol->v.nodes = *nodes;
     191                 :          0 :         return 0;
     192                 :            : }
     193                 :            : 
     194                 :            : /*
     195                 :            :  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
     196                 :            :  * any, for the new policy.  mpol_new() has already validated the nodes
     197                 :            :  * parameter with respect to the policy mode and flags.  But, we need to
     198                 :            :  * handle an empty nodemask with MPOL_PREFERRED here.
     199                 :            :  *
     200                 :            :  * Must be called holding task's alloc_lock to protect task's mems_allowed
     201                 :            :  * and mempolicy.  May also be called holding the mmap_semaphore for write.
     202                 :            :  */
     203                 :          9 : static int mpol_set_nodemask(struct mempolicy *pol,
     204                 :            :                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
     205                 :            : {
     206                 :          9 :         int ret;
     207                 :            : 
     208                 :            :         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
     209         [ +  + ]:          9 :         if (pol == NULL)
     210                 :            :                 return 0;
     211                 :            :         /* Check N_MEMORY */
     212                 :          3 :         nodes_and(nsc->mask1,
     213                 :            :                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
     214                 :            : 
     215                 :          3 :         VM_BUG_ON(!nodes);
     216   [ -  +  -  - ]:          3 :         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
     217                 :            :                 nodes = NULL;   /* explicit local allocation */
     218                 :            :         else {
     219         [ -  + ]:          3 :                 if (pol->flags & MPOL_F_RELATIVE_NODES)
     220                 :          0 :                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
     221                 :            :                 else
     222                 :          3 :                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
     223                 :            : 
     224         [ -  + ]:          3 :                 if (mpol_store_user_nodemask(pol))
     225                 :          0 :                         pol->w.user_nodemask = *nodes;
     226                 :            :                 else
     227                 :          3 :                         pol->w.cpuset_mems_allowed =
     228                 :            :                                                 cpuset_current_mems_allowed;
     229                 :            :         }
     230                 :            : 
     231         [ +  - ]:          3 :         if (nodes)
     232                 :          3 :                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
     233                 :            :         else
     234                 :          0 :                 ret = mpol_ops[pol->mode].create(pol, NULL);
     235                 :            :         return ret;
     236                 :            : }
     237                 :            : 
     238                 :            : /*
     239                 :            :  * This function just creates a new policy, does some checks and simple
     240                 :            :  * initialization. You must invoke mpol_set_nodemask() to set nodes.
     241                 :            :  */
     242                 :          9 : static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
     243                 :            :                                   nodemask_t *nodes)
     244                 :            : {
     245                 :          9 :         struct mempolicy *policy;
     246                 :            : 
     247                 :          9 :         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
     248                 :            :                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
     249                 :            : 
     250         [ +  + ]:          9 :         if (mode == MPOL_DEFAULT) {
     251   [ -  +  -  - ]:          6 :                 if (nodes && !nodes_empty(*nodes))
     252                 :            :                         return ERR_PTR(-EINVAL);
     253                 :          6 :                 return NULL;
     254                 :            :         }
     255                 :          3 :         VM_BUG_ON(!nodes);
     256                 :            : 
     257                 :            :         /*
     258                 :            :          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
     259                 :            :          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
     260                 :            :          * All other modes require a valid pointer to a non-empty nodemask.
     261                 :            :          */
     262         [ -  + ]:          3 :         if (mode == MPOL_PREFERRED) {
     263         [ #  # ]:          0 :                 if (nodes_empty(*nodes)) {
     264   [ #  #  #  # ]:          0 :                         if (((flags & MPOL_F_STATIC_NODES) ||
     265                 :            :                              (flags & MPOL_F_RELATIVE_NODES)))
     266                 :            :                                 return ERR_PTR(-EINVAL);
     267                 :            :                 }
     268         [ -  + ]:          3 :         } else if (mode == MPOL_LOCAL) {
     269   [ #  #  #  # ]:          0 :                 if (!nodes_empty(*nodes) ||
     270         [ #  # ]:          0 :                     (flags & MPOL_F_STATIC_NODES) ||
     271                 :            :                     (flags & MPOL_F_RELATIVE_NODES))
     272                 :            :                         return ERR_PTR(-EINVAL);
     273                 :            :                 mode = MPOL_PREFERRED;
     274         [ +  - ]:          3 :         } else if (nodes_empty(*nodes))
     275                 :            :                 return ERR_PTR(-EINVAL);
     276                 :          3 :         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
     277         [ +  - ]:          3 :         if (!policy)
     278                 :            :                 return ERR_PTR(-ENOMEM);
     279                 :          3 :         atomic_set(&policy->refcnt, 1);
     280                 :          3 :         policy->mode = mode;
     281                 :          3 :         policy->flags = flags;
     282                 :            : 
     283                 :          3 :         return policy;
     284                 :            : }
     285                 :            : 
     286                 :            : /* Slow path of a mpol destructor. */
     287                 :          6 : void __mpol_put(struct mempolicy *p)
     288                 :            : {
     289         [ +  - ]:          6 :         if (!atomic_dec_and_test(&p->refcnt))
     290                 :            :                 return;
     291                 :          6 :         kmem_cache_free(policy_cache, p);
     292                 :            : }
     293                 :            : 
     294                 :          0 : static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
     295                 :            : {
     296                 :          0 : }
     297                 :            : 
     298                 :          0 : static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
     299                 :            : {
     300                 :          0 :         nodemask_t tmp;
     301                 :            : 
     302         [ #  # ]:          0 :         if (pol->flags & MPOL_F_STATIC_NODES)
     303                 :          0 :                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
     304         [ #  # ]:          0 :         else if (pol->flags & MPOL_F_RELATIVE_NODES)
     305                 :          0 :                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
     306                 :            :         else {
     307                 :          0 :                 nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
     308                 :            :                                                                 *nodes);
     309                 :          0 :                 pol->w.cpuset_mems_allowed = *nodes;
     310                 :            :         }
     311                 :            : 
     312         [ #  # ]:          0 :         if (nodes_empty(tmp))
     313                 :          0 :                 tmp = *nodes;
     314                 :            : 
     315                 :          0 :         pol->v.nodes = tmp;
     316                 :          0 : }
     317                 :            : 
     318                 :          0 : static void mpol_rebind_preferred(struct mempolicy *pol,
     319                 :            :                                                 const nodemask_t *nodes)
     320                 :            : {
     321                 :          0 :         nodemask_t tmp;
     322                 :            : 
     323         [ #  # ]:          0 :         if (pol->flags & MPOL_F_STATIC_NODES) {
     324                 :          0 :                 int node = first_node(pol->w.user_nodemask);
     325                 :            : 
     326         [ #  # ]:          0 :                 if (node_isset(node, *nodes)) {
     327                 :          0 :                         pol->v.preferred_node = node;
     328                 :          0 :                         pol->flags &= ~MPOL_F_LOCAL;
     329                 :            :                 } else
     330                 :          0 :                         pol->flags |= MPOL_F_LOCAL;
     331         [ #  # ]:          0 :         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
     332                 :          0 :                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
     333                 :          0 :                 pol->v.preferred_node = first_node(tmp);
     334         [ #  # ]:          0 :         } else if (!(pol->flags & MPOL_F_LOCAL)) {
     335                 :          0 :                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
     336                 :            :                                                    pol->w.cpuset_mems_allowed,
     337                 :            :                                                    *nodes);
     338                 :          0 :                 pol->w.cpuset_mems_allowed = *nodes;
     339                 :            :         }
     340                 :          0 : }
     341                 :            : 
     342                 :            : /*
     343                 :            :  * mpol_rebind_policy - Migrate a policy to a different set of nodes
     344                 :            :  *
     345                 :            :  * Per-vma policies are protected by mmap_sem. Allocations using per-task
     346                 :            :  * policies are protected by task->mems_allowed_seq to prevent a premature
     347                 :            :  * OOM/allocation failure due to parallel nodemask modification.
     348                 :            :  */
     349                 :          0 : static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
     350                 :            : {
     351         [ #  # ]:          0 :         if (!pol)
     352                 :            :                 return;
     353   [ #  #  #  #  :          0 :         if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
                   #  # ]
     354                 :            :             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
     355                 :            :                 return;
     356                 :            : 
     357                 :          0 :         mpol_ops[pol->mode].rebind(pol, newmask);
     358                 :            : }
     359                 :            : 
     360                 :            : /*
     361                 :            :  * Wrapper for mpol_rebind_policy() that just requires task
     362                 :            :  * pointer, and updates task mempolicy.
     363                 :            :  *
     364                 :            :  * Called with task's alloc_lock held.
     365                 :            :  */
     366                 :            : 
     367                 :          0 : void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
     368                 :            : {
     369                 :          0 :         mpol_rebind_policy(tsk->mempolicy, new);
     370                 :          0 : }
     371                 :            : 
     372                 :            : /*
     373                 :            :  * Rebind each vma in mm to new nodemask.
     374                 :            :  *
     375                 :            :  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
     376                 :            :  */
     377                 :            : 
     378                 :          0 : void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
     379                 :            : {
     380                 :          0 :         struct vm_area_struct *vma;
     381                 :            : 
     382                 :          0 :         down_write(&mm->mmap_sem);
     383         [ #  # ]:          0 :         for (vma = mm->mmap; vma; vma = vma->vm_next)
     384                 :          0 :                 mpol_rebind_policy(vma->vm_policy, new);
     385                 :          0 :         up_write(&mm->mmap_sem);
     386                 :          0 : }
     387                 :            : 
     388                 :            : static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
     389                 :            :         [MPOL_DEFAULT] = {
     390                 :            :                 .rebind = mpol_rebind_default,
     391                 :            :         },
     392                 :            :         [MPOL_INTERLEAVE] = {
     393                 :            :                 .create = mpol_new_interleave,
     394                 :            :                 .rebind = mpol_rebind_nodemask,
     395                 :            :         },
     396                 :            :         [MPOL_PREFERRED] = {
     397                 :            :                 .create = mpol_new_preferred,
     398                 :            :                 .rebind = mpol_rebind_preferred,
     399                 :            :         },
     400                 :            :         [MPOL_BIND] = {
     401                 :            :                 .create = mpol_new_bind,
     402                 :            :                 .rebind = mpol_rebind_nodemask,
     403                 :            :         },
     404                 :            : };
     405                 :            : 
     406                 :            : static int migrate_page_add(struct page *page, struct list_head *pagelist,
     407                 :            :                                 unsigned long flags);
     408                 :            : 
     409                 :            : struct queue_pages {
     410                 :            :         struct list_head *pagelist;
     411                 :            :         unsigned long flags;
     412                 :            :         nodemask_t *nmask;
     413                 :            :         unsigned long start;
     414                 :            :         unsigned long end;
     415                 :            :         struct vm_area_struct *first;
     416                 :            : };
     417                 :            : 
     418                 :            : /*
     419                 :            :  * Check if the page's nid is in qp->nmask.
     420                 :            :  *
     421                 :            :  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
     422                 :            :  * in the invert of qp->nmask.
     423                 :            :  */
     424                 :          0 : static inline bool queue_pages_required(struct page *page,
     425                 :            :                                         struct queue_pages *qp)
     426                 :            : {
     427                 :          0 :         int nid = page_to_nid(page);
     428                 :          0 :         unsigned long flags = qp->flags;
     429                 :            : 
     430                 :          0 :         return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
     431                 :            : }
     432                 :            : 
     433                 :            : /*
     434                 :            :  * queue_pages_pmd() has four possible return values:
     435                 :            :  * 0 - pages are placed on the right node or queued successfully.
     436                 :            :  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
     437                 :            :  *     specified.
     438                 :            :  * 2 - THP was split.
     439                 :            :  * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
     440                 :            :  *        and an existing page was already on a node that does not follow
     441                 :            :  *        the policy.
     442                 :            :  */
     443                 :            : static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
     444                 :            :                                 unsigned long end, struct mm_walk *walk)
     445                 :            : {
     446                 :            :         int ret = 0;
     447                 :            :         struct page *page;
     448                 :            :         struct queue_pages *qp = walk->private;
     449                 :            :         unsigned long flags;
     450                 :            : 
     451                 :            :         if (unlikely(is_pmd_migration_entry(*pmd))) {
     452                 :            :                 ret = -EIO;
     453                 :            :                 goto unlock;
     454                 :            :         }
     455                 :            :         page = pmd_page(*pmd);
     456                 :            :         if (is_huge_zero_page(page)) {
     457                 :            :                 spin_unlock(ptl);
     458                 :            :                 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
     459                 :            :                 ret = 2;
     460                 :            :                 goto out;
     461                 :            :         }
     462                 :            :         if (!queue_pages_required(page, qp))
     463                 :            :                 goto unlock;
     464                 :            : 
     465                 :            :         flags = qp->flags;
     466                 :            :         /* go to thp migration */
     467                 :            :         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
     468                 :            :                 if (!vma_migratable(walk->vma) ||
     469                 :            :                     migrate_page_add(page, qp->pagelist, flags)) {
     470                 :            :                         ret = 1;
     471                 :            :                         goto unlock;
     472                 :            :                 }
     473                 :            :         } else
     474                 :            :                 ret = -EIO;
     475                 :            : unlock:
     476                 :            :         spin_unlock(ptl);
     477                 :            : out:
     478                 :            :         return ret;
     479                 :            : }
     480                 :            : 
     481                 :            : /*
     482                 :            :  * Scan through pages checking if pages follow certain conditions,
     483                 :            :  * and move them to the pagelist if they do.
     484                 :            :  *
     485                 :            :  * queue_pages_pte_range() has three possible return values:
     486                 :            :  * 0 - pages are placed on the right node or queued successfully.
     487                 :            :  * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
     488                 :            :  *     specified.
     489                 :            :  * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
     490                 :            :  *        on a node that does not follow the policy.
     491                 :            :  */
     492                 :          0 : static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
     493                 :            :                         unsigned long end, struct mm_walk *walk)
     494                 :            : {
     495                 :          0 :         struct vm_area_struct *vma = walk->vma;
     496                 :          0 :         struct page *page;
     497                 :          0 :         struct queue_pages *qp = walk->private;
     498                 :          0 :         unsigned long flags = qp->flags;
     499                 :          0 :         int ret;
     500                 :          0 :         bool has_unmovable = false;
     501                 :          0 :         pte_t *pte;
     502                 :          0 :         spinlock_t *ptl;
     503                 :            : 
     504         [ #  # ]:          0 :         ptl = pmd_trans_huge_lock(pmd, vma);
     505                 :          0 :         if (ptl) {
     506                 :            :                 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
     507                 :            :                 if (ret != 2)
     508                 :            :                         return ret;
     509                 :            :         }
     510                 :            :         /* THP was split, fall through to pte walk */
     511                 :            : 
     512         [ #  # ]:          0 :         if (pmd_trans_unstable(pmd))
     513                 :            :                 return 0;
     514                 :            : 
     515         [ #  # ]:          0 :         pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
     516         [ #  # ]:          0 :         for (; addr != end; pte++, addr += PAGE_SIZE) {
     517         [ #  # ]:          0 :                 if (!pte_present(*pte))
     518                 :          0 :                         continue;
     519                 :          0 :                 page = vm_normal_page(vma, addr, *pte);
     520         [ #  # ]:          0 :                 if (!page)
     521                 :          0 :                         continue;
     522                 :            :                 /*
     523                 :            :                  * vm_normal_page() filters out zero pages, but there might
     524                 :            :                  * still be PageReserved pages to skip, perhaps in a VDSO.
     525                 :            :                  */
     526         [ #  # ]:          0 :                 if (PageReserved(page))
     527                 :          0 :                         continue;
     528         [ #  # ]:          0 :                 if (!queue_pages_required(page, qp))
     529                 :          0 :                         continue;
     530         [ #  # ]:          0 :                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
     531                 :            :                         /* MPOL_MF_STRICT must be specified if we get here */
     532         [ #  # ]:          0 :                         if (!vma_migratable(vma)) {
     533                 :            :                                 has_unmovable = true;
     534                 :            :                                 break;
     535                 :            :                         }
     536                 :            : 
     537                 :            :                         /*
     538                 :            :                          * Do not abort immediately since there may be
     539                 :            :                          * pages temporarily off the LRU in the range.  We
     540                 :            :                          * still need to migrate the other LRU pages.
     541                 :            :                          */
     542         [ #  # ]:          0 :                         if (migrate_page_add(page, qp->pagelist, flags))
     543                 :          0 :                                 has_unmovable = true;
     544                 :            :                 } else
     545                 :            :                         break;
     546                 :            :         }
     547                 :          0 :         pte_unmap_unlock(pte - 1, ptl);
     548                 :          0 :         cond_resched();
     549                 :            : 
     550         [ #  # ]:          0 :         if (has_unmovable)
     551                 :            :                 return 1;
     552                 :            : 
     553         [ #  # ]:          0 :         return addr != end ? -EIO : 0;
     554                 :            : }
     555                 :            : 
     556                 :          0 : static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
     557                 :            :                                unsigned long addr, unsigned long end,
     558                 :            :                                struct mm_walk *walk)
     559                 :            : {
     560                 :            : #ifdef CONFIG_HUGETLB_PAGE
     561                 :          0 :         struct queue_pages *qp = walk->private;
     562                 :          0 :         unsigned long flags = qp->flags;
     563                 :          0 :         struct page *page;
     564                 :          0 :         spinlock_t *ptl;
     565                 :          0 :         pte_t entry;
     566                 :            : 
     567                 :          0 :         ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
     568         [ #  # ]:          0 :         entry = huge_ptep_get(pte);
     569         [ #  # ]:          0 :         if (!pte_present(entry))
     570                 :          0 :                 goto unlock;
     571         [ #  # ]:          0 :         page = pte_page(entry);
     572         [ #  # ]:          0 :         if (!queue_pages_required(page, qp))
     573                 :          0 :                 goto unlock;
     574                 :            :         /* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
     575         [ #  # ]:          0 :         if (flags & (MPOL_MF_MOVE_ALL) ||
     576   [ #  #  #  # ]:          0 :             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
     577                 :          0 :                 isolate_huge_page(page, qp->pagelist);
     578                 :          0 : unlock:
     579                 :          0 :         spin_unlock(ptl);
     580                 :            : #else
     581                 :            :         BUG();
     582                 :            : #endif
     583                 :          0 :         return 0;
     584                 :            : }
     585                 :            : 
     586                 :            : #ifdef CONFIG_NUMA_BALANCING
     587                 :            : /*
     588                 :            :  * This is used to mark a range of virtual addresses to be inaccessible.
     589                 :            :  * These are later cleared by a NUMA hinting fault. Depending on these
     590                 :            :  * faults, pages may be migrated for better NUMA placement.
     591                 :            :  *
     592                 :            :  * This is assuming that NUMA faults are handled using PROT_NONE. If
     593                 :            :  * an architecture makes a different choice, it will need further
     594                 :            :  * changes to the core.
     595                 :            :  */
     596                 :            : unsigned long change_prot_numa(struct vm_area_struct *vma,
     597                 :            :                         unsigned long addr, unsigned long end)
     598                 :            : {
     599                 :            :         int nr_updated;
     600                 :            : 
     601                 :            :         nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
     602                 :            :         if (nr_updated)
     603                 :            :                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
     604                 :            : 
     605                 :            :         return nr_updated;
     606                 :            : }
     607                 :            : #else
     608                 :            : static unsigned long change_prot_numa(struct vm_area_struct *vma,
     609                 :            :                         unsigned long addr, unsigned long end)
     610                 :            : {
     611                 :            :         return 0;
     612                 :            : }
     613                 :            : #endif /* CONFIG_NUMA_BALANCING */
     614                 :            : 
     615                 :          0 : static int queue_pages_test_walk(unsigned long start, unsigned long end,
     616                 :            :                                 struct mm_walk *walk)
     617                 :            : {
     618                 :          0 :         struct vm_area_struct *vma = walk->vma;
     619                 :          0 :         struct queue_pages *qp = walk->private;
     620                 :          0 :         unsigned long endvma = vma->vm_end;
     621                 :          0 :         unsigned long flags = qp->flags;
     622                 :            : 
     623                 :            :         /* range check first */
     624                 :          0 :         VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
     625                 :            : 
     626         [ #  # ]:          0 :         if (!qp->first) {
     627                 :          0 :                 qp->first = vma;
     628         [ #  # ]:          0 :                 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
     629         [ #  # ]:          0 :                         (qp->start < vma->vm_start))
     630                 :            :                         /* hole at head side of range */
     631                 :            :                         return -EFAULT;
     632                 :            :         }
     633         [ #  # ]:          0 :         if (!(flags & MPOL_MF_DISCONTIG_OK) &&
     634         [ #  # ]:          0 :                 ((vma->vm_end < qp->end) &&
     635   [ #  #  #  # ]:          0 :                 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
     636                 :            :                 /* hole at middle or tail of range */
     637                 :            :                 return -EFAULT;
     638                 :            : 
     639                 :            :         /*
     640                 :            :          * Need to check MPOL_MF_STRICT to return -EIO if possible,
     641                 :            :          * regardless of vma_migratable().
     642                 :            :          */
     643         [ #  # ]:          0 :         if (!vma_migratable(vma) &&
     644         [ #  # ]:          0 :             !(flags & MPOL_MF_STRICT))
     645                 :            :                 return 1;
     646                 :            : 
     647                 :          0 :         if (endvma > end)
     648                 :            :                 endvma = end;
     649                 :            : 
     650         [ #  # ]:          0 :         if (flags & MPOL_MF_LAZY) {
     651                 :            :                 /* Similar to task_numa_work, skip inaccessible VMAs */
     652                 :            :                 if (!is_vm_hugetlb_page(vma) &&
     653                 :            :                         (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
     654                 :            :                         !(vma->vm_flags & VM_MIXEDMAP))
     655                 :            :                         change_prot_numa(vma, start, endvma);
     656                 :            :                 return 1;
     657                 :            :         }
     658                 :            : 
     659                 :            :         /* queue pages from current vma */
     660         [ #  # ]:          0 :         if (flags & MPOL_MF_VALID)
     661                 :          0 :                 return 0;
     662                 :            :         return 1;
     663                 :            : }
     664                 :            : 
     665                 :            : static const struct mm_walk_ops queue_pages_walk_ops = {
     666                 :            :         .hugetlb_entry          = queue_pages_hugetlb,
     667                 :            :         .pmd_entry              = queue_pages_pte_range,
     668                 :            :         .test_walk              = queue_pages_test_walk,
     669                 :            : };
     670                 :            : 
     671                 :            : /*
     672                 :            :  * Walk through page tables and collect pages to be migrated.
     673                 :            :  *
     674                 :            :  * If pages found in a given range are on a set of nodes (determined by
     675                 :            :  * @nodes and @flags), they are isolated and queued to the list passed
     676                 :            :  * via @pagelist.
     677                 :            :  *
     678                 :            :  * queue_pages_range() has three possible return values:
     679                 :            :  * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
     680                 :            :  *     specified.
     681                 :            :  * 0 - queue pages successfully or no misplaced page.
     682                 :            :  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
     683                 :            :  *         memory range specified by nodemask and maxnode points outside
     684                 :            :  *         your accessible address space (-EFAULT)
     685                 :            :  */
     686                 :            : static int
     687                 :          0 : queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
     688                 :            :                 nodemask_t *nodes, unsigned long flags,
     689                 :            :                 struct list_head *pagelist)
     690                 :            : {
     691                 :          0 :         int err;
     692                 :          0 :         struct queue_pages qp = {
     693                 :            :                 .pagelist = pagelist,
     694                 :            :                 .flags = flags,
     695                 :            :                 .nmask = nodes,
     696                 :            :                 .start = start,
     697                 :            :                 .end = end,
     698                 :            :                 .first = NULL,
     699                 :            :         };
     700                 :            : 
     701                 :          0 :         err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
     702                 :            : 
     703         [ #  # ]:          0 :         if (!qp.first)
     704                 :            :                 /* whole range in hole */
     705                 :          0 :                 err = -EFAULT;
     706                 :            : 
     707                 :          0 :         return err;
     708                 :            : }
     709                 :            : 
     710                 :            : /*
     711                 :            :  * Apply policy to a single VMA
     712                 :            :  * This must be called with the mmap_sem held for writing.
     713                 :            :  */
     714                 :          0 : static int vma_replace_policy(struct vm_area_struct *vma,
     715                 :            :                                                 struct mempolicy *pol)
     716                 :            : {
     717                 :          0 :         int err;
     718                 :          0 :         struct mempolicy *old;
     719                 :          0 :         struct mempolicy *new;
     720                 :            : 
     721                 :          0 :         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
     722                 :            :                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
     723                 :            :                  vma->vm_ops, vma->vm_file,
     724                 :            :                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
     725                 :            : 
     726         [ #  # ]:          0 :         new = mpol_dup(pol);
     727         [ #  # ]:          0 :         if (IS_ERR(new))
     728                 :          0 :                 return PTR_ERR(new);
     729                 :            : 
     730   [ #  #  #  # ]:          0 :         if (vma->vm_ops && vma->vm_ops->set_policy) {
     731                 :          0 :                 err = vma->vm_ops->set_policy(vma, new);
     732         [ #  # ]:          0 :                 if (err)
     733                 :          0 :                         goto err_out;
     734                 :            :         }
     735                 :            : 
     736                 :          0 :         old = vma->vm_policy;
     737                 :          0 :         vma->vm_policy = new; /* protected by mmap_sem */
     738         [ #  # ]:          0 :         mpol_put(old);
     739                 :            : 
     740                 :            :         return 0;
     741                 :            :  err_out:
     742         [ #  # ]:          0 :         mpol_put(new);
     743                 :            :         return err;
     744                 :            : }
     745                 :            : 
     746                 :            : /* Step 2: apply policy to a range and do splits. */
     747                 :          0 : static int mbind_range(struct mm_struct *mm, unsigned long start,
     748                 :            :                        unsigned long end, struct mempolicy *new_pol)
     749                 :            : {
     750                 :          0 :         struct vm_area_struct *next;
     751                 :          0 :         struct vm_area_struct *prev;
     752                 :          0 :         struct vm_area_struct *vma;
     753                 :          0 :         int err = 0;
     754                 :          0 :         pgoff_t pgoff;
     755                 :          0 :         unsigned long vmstart;
     756                 :          0 :         unsigned long vmend;
     757                 :            : 
     758                 :          0 :         vma = find_vma(mm, start);
     759                 :          0 :         VM_BUG_ON(!vma);
     760                 :            : 
     761                 :          0 :         prev = vma->vm_prev;
     762         [ #  # ]:          0 :         if (start > vma->vm_start)
     763                 :          0 :                 prev = vma;
     764                 :            : 
     765   [ #  #  #  # ]:          0 :         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
     766                 :          0 :                 next = vma->vm_next;
     767                 :          0 :                 vmstart = max(start, vma->vm_start);
     768                 :          0 :                 vmend   = min(end, vma->vm_end);
     769                 :            : 
     770   [ #  #  #  # ]:          0 :                 if (mpol_equal(vma_policy(vma), new_pol))
     771                 :          0 :                         continue;
     772                 :            : 
     773                 :          0 :                 pgoff = vma->vm_pgoff +
     774                 :          0 :                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
     775                 :          0 :                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
     776                 :            :                                  vma->anon_vma, vma->vm_file, pgoff,
     777                 :            :                                  new_pol, vma->vm_userfaultfd_ctx);
     778         [ #  # ]:          0 :                 if (prev) {
     779                 :          0 :                         vma = prev;
     780                 :          0 :                         next = vma->vm_next;
     781   [ #  #  #  # ]:          0 :                         if (mpol_equal(vma_policy(vma), new_pol))
     782                 :          0 :                                 continue;
     783                 :            :                         /* vma_merge() joined vma && vma->next, case 8 */
     784                 :          0 :                         goto replace;
     785                 :            :                 }
     786         [ #  # ]:          0 :                 if (vma->vm_start != vmstart) {
     787                 :          0 :                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
     788         [ #  # ]:          0 :                         if (err)
     789                 :          0 :                                 goto out;
     790                 :            :                 }
     791         [ #  # ]:          0 :                 if (vma->vm_end != vmend) {
     792                 :          0 :                         err = split_vma(vma->vm_mm, vma, vmend, 0);
     793         [ #  # ]:          0 :                         if (err)
     794                 :          0 :                                 goto out;
     795                 :            :                 }
     796                 :          0 :  replace:
     797                 :          0 :                 err = vma_replace_policy(vma, new_pol);
     798         [ #  # ]:          0 :                 if (err)
     799                 :          0 :                         goto out;
     800                 :            :         }
     801                 :            : 
     802                 :          0 :  out:
     803                 :          0 :         return err;
     804                 :            : }
     805                 :            : 
     806                 :            : /* Set the process memory policy */
     807                 :          9 : static long do_set_mempolicy(unsigned short mode, unsigned short flags,
     808                 :            :                              nodemask_t *nodes)
     809                 :            : {
     810                 :          9 :         struct mempolicy *new, *old;
     811                 :          9 :         NODEMASK_SCRATCH(scratch);
     812                 :          9 :         int ret;
     813                 :            : 
     814                 :          9 :         if (!scratch)
     815                 :            :                 return -ENOMEM;
     816                 :            : 
     817                 :          9 :         new = mpol_new(mode, flags, nodes);
     818         [ -  + ]:          9 :         if (IS_ERR(new)) {
     819                 :          0 :                 ret = PTR_ERR(new);
     820                 :          0 :                 goto out;
     821                 :            :         }
     822                 :            : 
     823                 :          9 :         task_lock(current);
     824                 :          9 :         ret = mpol_set_nodemask(new, nodes, scratch);
     825         [ -  + ]:          9 :         if (ret) {
     826                 :          0 :                 task_unlock(current);
     827         [ #  # ]:          0 :                 mpol_put(new);
     828                 :          0 :                 goto out;
     829                 :            :         }
     830         [ +  + ]:          9 :         old = current->mempolicy;
     831                 :          9 :         current->mempolicy = new;
     832   [ +  +  +  - ]:          9 :         if (new && new->mode == MPOL_INTERLEAVE)
     833                 :          3 :                 current->il_prev = MAX_NUMNODES-1;
     834                 :          9 :         task_unlock(current);
     835         [ +  + ]:          9 :         mpol_put(old);
     836                 :            :         ret = 0;
     837                 :          9 : out:
     838                 :          9 :         NODEMASK_SCRATCH_FREE(scratch);
     839                 :          9 :         return ret;
     840                 :            : }
     841                 :            : 
     842                 :            : /*
     843                 :            :  * Return nodemask for policy for get_mempolicy() query
     844                 :            :  *
     845                 :            :  * Called with task's alloc_lock held
     846                 :            :  */
     847                 :          0 : static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
     848                 :            : {
     849         [ #  # ]:          0 :         nodes_clear(*nodes);
     850         [ #  # ]:          0 :         if (p == &default_policy)
     851                 :            :                 return;
     852                 :            : 
     853      [ #  #  # ]:          0 :         switch (p->mode) {
     854                 :          0 :         case MPOL_BIND:
     855                 :            :                 /* Fall through */
     856                 :            :         case MPOL_INTERLEAVE:
     857                 :          0 :                 *nodes = p->v.nodes;
     858                 :          0 :                 break;
     859                 :          0 :         case MPOL_PREFERRED:
     860         [ #  # ]:          0 :                 if (!(p->flags & MPOL_F_LOCAL))
     861                 :          0 :                         node_set(p->v.preferred_node, *nodes);
     862                 :            :                 /* else return empty node mask for local allocation */
     863                 :            :                 break;
     864                 :          0 :         default:
     865                 :          0 :                 BUG();
     866                 :            :         }
     867                 :            : }
     868                 :            : 
                                        /*
                                         * Return the NUMA node id of the page backing @addr in @mm.
                                         *
                                         * Pins the single page containing @addr with get_user_pages_locked(),
                                         * reads its node with page_to_nid() and drops the reference again with
                                         * put_page().  A negative result from get_user_pages_locked() is
                                         * returned unchanged.
                                         *
                                         * mm->mmap_sem is released here whenever 'locked' is still set after
                                         * the lookup; do_get_mempolicy() takes it for read before calling.
                                         *
                                         * NOTE(review): the 'err >= 0' test also accepts 0.  If
                                         * get_user_pages_locked() can return 0 without storing a page in 'p',
                                         * 'p' would be read uninitialized -- confirm against its contract.
                                         */
     869                 :          0 : static int lookup_node(struct mm_struct *mm, unsigned long addr)
     870                 :            : {
     871                 :          0 :         struct page *p;
     872                 :          0 :         int err;
     873                 :            : 
     874                 :          0 :         int locked = 1;
     875                 :          0 :         err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
     876         [ #  # ]:          0 :         if (err >= 0) {
     877                 :          0 :                 err = page_to_nid(p);
     878                 :          0 :                 put_page(p);
     879                 :            :         }
     880         [ #  # ]:          0 :         if (locked)
     881                 :          0 :                 up_read(&mm->mmap_sem);
     882                 :          0 :         return err;
     883                 :            : }
     884                 :            : 
     885                 :            : /* Retrieve NUMA policy */
                                        /*
                                         * @policy: out; receives the policy mode OR'ed with its MPOL_MODE_FLAGS
                                         *          bits, or a node id when MPOL_F_NODE is set
                                         * @nmask:  out; may be NULL on the normal path, in which case no mask
                                         *          is reported
                                         * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set
                                         * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
                                         *
                                         * Returns 0 on success.  -EINVAL is returned for unknown flags, for
                                         * MPOL_F_MEMS_ALLOWED combined with another flag, for a non-zero @addr
                                         * without MPOL_F_ADDR, and for MPOL_F_NODE alone when the policy is
                                         * not the task's own MPOL_INTERLEAVE policy.  -EFAULT is returned when
                                         * no vma covers @addr.  A negative value from lookup_node() is passed
                                         * through.
                                         *
                                         * Flag handling:
                                         *  - MPOL_F_MEMS_ALLOWED: *policy is set to 0 and *nmask to
                                         *    cpuset_current_mems_allowed, copied under task_lock().
                                         *  - MPOL_F_ADDR: the policy comes from the vma at @addr, via
                                         *    vm_ops->get_policy() when present, else vma->vm_policy.
                                         *  - MPOL_F_NODE | MPOL_F_ADDR: *policy is the node of the page at @addr.
                                         *  - MPOL_F_NODE alone: *policy is the next interleave node after
                                         *    current->il_prev.
                                         * A NULL policy is reported as &default_policy (MPOL_DEFAULT).
                                         *
                                         * NOTE(review): the MPOL_F_MEMS_ALLOWED path dereferences @nmask
                                         * without the 'if (nmask)' check used further down -- presumably
                                         * callers always supply a mask with that flag; confirm.
                                         */
     886                 :          0 : static long do_get_mempolicy(int *policy, nodemask_t *nmask,
     887                 :            :                              unsigned long addr, unsigned long flags)
     888                 :            : {
     889                 :          0 :         int err;
     890         [ #  # ]:          0 :         struct mm_struct *mm = current->mm;
     891                 :          0 :         struct vm_area_struct *vma = NULL;
     892                 :          0 :         struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
     893                 :            : 
     894         [ #  # ]:          0 :         if (flags &
     895                 :            :                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
     896                 :            :                 return -EINVAL;
     897                 :            : 
     898         [ #  # ]:          0 :         if (flags & MPOL_F_MEMS_ALLOWED) {
     899         [ #  # ]:          0 :                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
     900                 :            :                         return -EINVAL;
     901                 :          0 :                 *policy = 0;    /* just so it's initialized */
     902                 :          0 :                 task_lock(current);
     903                 :          0 :                 *nmask  = cpuset_current_mems_allowed;
     904                 :          0 :                 task_unlock(current);
     905                 :          0 :                 return 0;
     906                 :            :         }
     907                 :            : 
     908         [ #  # ]:          0 :         if (flags & MPOL_F_ADDR) {
     909                 :            :                 /*
     910                 :            :                  * Do NOT fall back to task policy if the
     911                 :            :                  * vma/shared policy at addr is NULL.  We
     912                 :            :                  * want to return MPOL_DEFAULT in this case.
     913                 :            :                  */
     914                 :          0 :                 down_read(&mm->mmap_sem);
     915                 :          0 :                 vma = find_vma_intersection(mm, addr, addr+1);
     916         [ #  # ]:          0 :                 if (!vma) {
     917                 :          0 :                         up_read(&mm->mmap_sem);
     918                 :          0 :                         return -EFAULT;
     919                 :            :                 }
     920   [ #  #  #  # ]:          0 :                 if (vma->vm_ops && vma->vm_ops->get_policy)
     921                 :          0 :                         pol = vma->vm_ops->get_policy(vma, addr);
     922                 :            :                 else
     923                 :          0 :                         pol = vma->vm_policy;
     924         [ #  # ]:          0 :         } else if (addr)
     925                 :            :                 return -EINVAL;
     926                 :            : 
     927         [ #  # ]:          0 :         if (!pol)
     928                 :          0 :                 pol = &default_policy;      /* indicates default behavior */
     929                 :            : 
     930         [ #  # ]:          0 :         if (flags & MPOL_F_NODE) {
     931         [ #  # ]:          0 :                 if (flags & MPOL_F_ADDR) {
     932                 :            :                         /*
     933                 :            :                          * Take a refcount on the mpol, lookup_node()
     934                 :            :                          * will drop the mmap_sem, so after calling
     935                 :            :                          * lookup_node() only "pol" remains valid, "vma"
     936                 :            :                          * is stale.
     937                 :            :                          */
     938                 :          0 :                         pol_refcount = pol;
     939                 :          0 :                         vma = NULL;
     940         [ #  # ]:          0 :                         mpol_get(pol);
     941                 :          0 :                         err = lookup_node(mm, addr);
     942         [ #  # ]:          0 :                         if (err < 0)
     943                 :          0 :                                 goto out;
     944                 :          0 :                         *policy = err;
     945         [ #  # ]:          0 :                 } else if (pol == current->mempolicy &&
     946         [ #  # ]:          0 :                                 pol->mode == MPOL_INTERLEAVE) {
     947                 :          0 :                         *policy = next_node_in(current->il_prev, pol->v.nodes);
     948                 :            :                 } else {
     949                 :          0 :                         err = -EINVAL;
     950                 :          0 :                         goto out;
     951                 :            :                 }
     952                 :            :         } else {
     953         [ #  # ]:          0 :                 *policy = pol == &default_policy ? MPOL_DEFAULT :
     954                 :          0 :                                                 pol->mode;
     955                 :            :                 /*
     956                 :            :                  * Internal mempolicy flags must be masked off before exposing
     957                 :            :                  * the policy to userspace.
     958                 :            :                  */
     959                 :          0 :                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
     960                 :            :         }
     961                 :            : 
     962                 :          0 :         err = 0;
     963         [ #  # ]:          0 :         if (nmask) {
     964         [ #  # ]:          0 :                 if (mpol_store_user_nodemask(pol)) {
     965                 :          0 :                         *nmask = pol->w.user_nodemask;
     966                 :            :                 } else {
     967                 :          0 :                         task_lock(current);
     968                 :          0 :                         get_policy_nodemask(pol, nmask);
     969                 :          0 :                         task_unlock(current);
     970                 :            :                 }
     971                 :            :         }
     972                 :            : 
                                                /*
                                                 * Common exit: 'vma' is still non-NULL only if mmap_sem was
                                                 * taken above and not handed to lookup_node(); 'pol_refcount'
                                                 * is set only when mpol_get() was called above.
                                                 */
     973                 :          0 :  out:
     974         [ #  # ]:          0 :         mpol_cond_put(pol);
     975         [ #  # ]:          0 :         if (vma)
     976                 :          0 :                 up_read(&mm->mmap_sem);
     977         [ #  # ]:          0 :         if (pol_refcount)
     978                 :          0 :                 mpol_put(pol_refcount);
     979                 :          0 :         return err;
     980                 :            : }
     981                 :            : 
     982                 :            : #ifdef CONFIG_MIGRATION
     983                 :            : /*
     984                 :            :  * page migration, thp tail pages can be passed.
                                         *
                                         * Queue @page for migration by isolating its compound head from the
                                         * LRU and appending it to @pagelist.  All work is done on the head
                                         * page returned by compound_head().
                                         *
                                         * The page is only considered when MPOL_MF_MOVE_ALL is set in @flags
                                         * or the head has a mapcount of exactly 1.  On successful isolation
                                         * the node's NR_ISOLATED_ANON / file counter is raised by
                                         * hpage_nr_pages(head).
                                         *
                                         * Returns -EIO when isolation fails and MPOL_MF_STRICT is set,
                                         * otherwise 0 (including when the page was skipped).
     985                 :            :  */
     986                 :          0 : static int migrate_page_add(struct page *page, struct list_head *pagelist,
     987                 :            :                                 unsigned long flags)
     988                 :            : {
     989         [ #  # ]:          0 :         struct page *head = compound_head(page);
     990                 :            :         /*
     991                 :            :          * Avoid migrating a page that is shared with others.
     992                 :            :          */
     993   [ #  #  #  # ]:          0 :         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
     994         [ #  # ]:          0 :                 if (!isolate_lru_page(head)) {
     995                 :          0 :                         list_add_tail(&head->lru, pagelist);
     996                 :          0 :                         mod_node_page_state(page_pgdat(head),
     997                 :          0 :                                 NR_ISOLATED_ANON + page_is_file_cache(head),
     998                 :            :                                 hpage_nr_pages(head));
     999         [ #  # ]:          0 :                 } else if (flags & MPOL_MF_STRICT) {
    1000                 :            :                         /*
    1001                 :            :                          * Non-movable page may reach here.  And, there may be
    1002                 :            :                          * temporary off LRU pages or non-LRU movable pages.
    1003                 :            :                          * Treat them as unmovable pages since they can't be
    1004                 :            :                          * isolated, so they can't be moved at the moment.  It
    1005                 :            :                          * should return -EIO for this case too.
    1006                 :            :                          */
    1007                 :          0 :                         return -EIO;
    1008                 :            :                 }
    1009                 :            :         }
    1010                 :            : 
    1011                 :            :         return 0;
    1012                 :            : }
    1013                 :            : 
    1014                 :            : /* page allocation callback for NUMA node migration */
                                        /*
                                         * @page: page about to be migrated; only its type is inspected
                                         * @node: target node id (migrate_to_node() hands this function to
                                         *        migrate_pages() together with the destination node)
                                         *
                                         * hugetlb pages are allocated with alloc_huge_page_node() using the
                                         * hstate of the compound head.  Transparent huge pages are allocated
                                         * at HPAGE_PMD_ORDER with GFP_TRANSHUGE | __GFP_THISNODE and prepared
                                         * with prep_transhuge_page().  Everything else is an order-0
                                         * GFP_HIGHUSER_MOVABLE | __GFP_THISNODE allocation.
                                         *
                                         * Returns the new page; the THP branch returns NULL explicitly when
                                         * the allocation fails.
                                         */
    1015                 :          0 : struct page *alloc_new_node_page(struct page *page, unsigned long node)
    1016                 :            : {
    1017         [ #  # ]:          0 :         if (PageHuge(page))
    1018         [ #  # ]:          0 :                 return alloc_huge_page_node(page_hstate(compound_head(page)),
    1019                 :            :                                         node);
    1020                 :          0 :         else if (PageTransHuge(page)) {
    1021                 :            :                 struct page *thp;
    1022                 :            : 
    1023                 :            :                 thp = alloc_pages_node(node,
    1024                 :            :                         (GFP_TRANSHUGE | __GFP_THISNODE),
    1025                 :            :                         HPAGE_PMD_ORDER);
    1026                 :            :                 if (!thp)
    1027                 :            :                         return NULL;
    1028                 :            :                 prep_transhuge_page(thp);
    1029                 :            :                 return thp;
    1030                 :            :         } else
    1031                 :          0 :                 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
    1032                 :            :                                                     __GFP_THISNODE, 0);
    1033                 :            : }
    1034                 :            : 
    1035                 :            : /*
    1036                 :            :  * Migrate pages from one node to a target node.
    1037                 :            :  * Returns error or the number of pages not migrated.
                                         *
                                         * @mm:     address space whose pages are scanned
                                         * @source: node whose pages are collected (the only bit set in the
                                         *          nodemask passed to queue_pages_range())
                                         * @dest:   node passed to alloc_new_node_page() via migrate_pages()
                                         * @flags:  must contain MPOL_MF_MOVE or MPOL_MF_MOVE_ALL
                                         *
                                         * The return value of queue_pages_range() is not checked here.  When
                                         * migrate_pages() returns non-zero, the pages left on the local list
                                         * are handed to putback_movable_pages().
                                         *
                                         * NOTE(review): mm->mmap is dereferenced without a NULL check, so this
                                         * presumably relies on @mm having at least one vma -- confirm.
    1038                 :            :  */
    1039                 :          0 : static int migrate_to_node(struct mm_struct *mm, int source, int dest,
    1040                 :            :                            int flags)
    1041                 :            : {
    1042                 :          0 :         nodemask_t nmask;
    1043                 :          0 :         LIST_HEAD(pagelist);
    1044                 :          0 :         int err = 0;
    1045                 :            : 
    1046                 :          0 :         nodes_clear(nmask);
    1047                 :          0 :         node_set(source, nmask);
    1048                 :            : 
    1049                 :            :         /*
    1050                 :            :          * This does not "check" the range but isolates all pages that
    1051                 :            :          * need migration.  Between passing in the full user address
    1052                 :            :          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
    1053                 :            :          */
    1054                 :          0 :         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
    1055                 :          0 :         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
    1056                 :          0 :                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
    1057                 :            : 
    1058         [ #  # ]:          0 :         if (!list_empty(&pagelist)) {
    1059                 :          0 :                 err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
    1060                 :            :                                         MIGRATE_SYNC, MR_SYSCALL);
    1061         [ #  # ]:          0 :                 if (err)
    1062                 :          0 :                         putback_movable_pages(&pagelist);
    1063                 :            :         }
    1064                 :            : 
    1065                 :          0 :         return err;
    1066                 :            : }
    1067                 :            : 
    1068                 :            : /*
    1069                 :            :  * Move pages between the two nodesets so as to preserve the physical
    1070                 :            :  * layout as much as possible.
    1071                 :            :  *
                                         * @mm:    address space to migrate; mmap_sem is held for read around
                                         *         the whole migration loop
                                         * @from:  source nodes
                                         * @to:    destination nodes; each source is mapped with node_remap()
                                         * @flags: passed through to migrate_to_node()
                                         *
    1072                 :            :  * Returns the number of pages that could not be moved, or the error
                                         * returned by migrate_prep() or (when negative) by migrate_to_node().
    1073                 :            :  */
    1074                 :          0 : int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
    1075                 :            :                      const nodemask_t *to, int flags)
    1076                 :            : {
    1077                 :          0 :         int busy = 0;
    1078                 :          0 :         int err;
    1079                 :          0 :         nodemask_t tmp;
    1080                 :            : 
    1081                 :          0 :         err = migrate_prep();
    1082         [ #  # ]:          0 :         if (err)
    1083                 :            :                 return err;
    1084                 :            : 
    1085                 :          0 :         down_read(&mm->mmap_sem);
    1086                 :            : 
    1087                 :            :         /*
    1088                 :            :          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
    1089                 :            :          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
    1090                 :            :          * bit in 'tmp', and return that <source, dest> pair for migration.
    1091                 :            :          * The pair of nodemasks 'to' and 'from' define the map.
    1092                 :            :          *
    1093                 :            :          * If no pair of bits is found that way, fallback to picking some
    1094                 :            :          * pair of 'source' and 'dest' bits that are not the same.  If the
    1095                 :            :          * 'source' and 'dest' bits are the same, this represents a node
    1096                 :            :          * that will be migrating to itself, so no pages need move.
    1097                 :            :          *
    1098                 :            :          * If no bits are left in 'tmp', or if all remaining bits left
    1099                 :            :          * in 'tmp' correspond to the same bit in 'to', return false
    1100                 :            :          * (nothing left to migrate).
    1101                 :            :          *
    1102                 :            :          * This lets us pick a pair of nodes to migrate between, such that
    1103                 :            :          * if possible the dest node is not already occupied by some other
    1104                 :            :          * source node, minimizing the risk of overloading the memory on a
    1105                 :            :          * node that would happen if we migrated incoming memory to a node
    1106                 :            :          * before migrating outgoing memory from that same node.
    1107                 :            :          *
    1108                 :            :          * A single scan of tmp is sufficient.  As we go, we remember the
    1109                 :            :          * most recent <s, d> pair that moved (s != d).  If we find a pair
    1110                 :            :          * that not only moved, but what's better, moved to an empty slot
    1111                 :            :          * (d is not set in tmp), then we break out then, with that pair.
    1112                 :            :          * Otherwise when we finish scanning tmp, we at least have the
    1113                 :            :          * most recent <s, d> pair that moved.  If we get all the way through
    1114                 :            :          * the scan of tmp without finding any node that moved, much less
    1115                 :            :          * moved to an empty node, then there is nothing left worth migrating.
    1116                 :            :          */
    1117                 :            : 
    1118                 :          0 :         tmp = *from;
    1119         [ #  # ]:          0 :         while (!nodes_empty(tmp)) {
    1120                 :          0 :                 int s,d;
    1121                 :          0 :                 int source = NUMA_NO_NODE;
    1122                 :          0 :                 int dest = 0;
    1123                 :            : 
    1124         [ #  # ]:          0 :                 for_each_node_mask(s, tmp) {
    1125                 :            : 
    1126                 :            :                         /*
    1127                 :            :                          * do_migrate_pages() tries to maintain the relative
    1128                 :            :                          * node relationship of the pages established between
    1129                 :            :                          * threads and memory areas.
    1130                 :            :                          *
    1131                 :            :                          * However if the number of source nodes is not equal to
    1132                 :            :                          * the number of destination nodes we can not preserve
    1133                 :            :                          * this node relative relationship.  In that case, skip
    1134                 :            :                          * copying memory from a node that is in the destination
    1135                 :            :                          * mask.
    1136                 :            :                          *
    1137                 :            :                          * Example: [2,3,4] -> [3,4,5] moves everything.
    1138                 :            :                          *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
    1139                 :            :                          */
    1140                 :            : 
    1141   [ #  #  #  # ]:          0 :                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
    1142                 :          0 :                                                 (node_isset(s, *to)))
    1143                 :          0 :                                 continue;
    1144                 :            : 
    1145                 :          0 :                         d = node_remap(s, *from, *to);
    1146         [ #  # ]:          0 :                         if (s == d)
    1147                 :          0 :                                 continue;
    1148                 :            : 
    1149                 :          0 :                         source = s;     /* Node moved. Memorize */
    1150                 :          0 :                         dest = d;
    1151                 :            : 
    1152                 :            :                         /* dest not in remaining from nodes? */
    1153         [ #  # ]:          0 :                         if (!node_isset(dest, tmp))
    1154                 :            :                                 break;
    1155                 :            :                 }
                                                        /* No <s, d> pair with s != d was found in this scan. */
    1156         [ #  # ]:          0 :                 if (source == NUMA_NO_NODE)
    1157                 :            :                         break;
    1158                 :            : 
    1159                 :          0 :                 node_clear(source, tmp);
    1160                 :          0 :                 err = migrate_to_node(mm, source, dest, flags);
                                                        /* Positive values accumulate in 'busy'; negative ones abort. */
    1161         [ #  # ]:          0 :                 if (err > 0)
    1162                 :          0 :                         busy += err;
    1163         [ #  # ]:          0 :                 if (err < 0)
    1164                 :            :                         break;
    1165                 :            :         }
    1166                 :          0 :         up_read(&mm->mmap_sem);
    1167         [ #  # ]:          0 :         if (err < 0)
    1168                 :          0 :                 return err;
    1169                 :            :         return busy;
    1170                 :            : 
    1171                 :            : }
    1172                 :            : 
    1173                 :            : /*
    1174                 :            :  * Allocate a new page for page migration based on vma policy.
    1175                 :            :  * Start by assuming the page is mapped by the same vma as contains @start.
    1176                 :            :  * Search forward from there, if not.  N.B., this assumes that the
    1177                 :            :  * list of pages handed to migrate_pages()--which is how we get here--
    1178                 :            :  * is in virtual address order.
                                         *
                                         * The vma list of current->mm is walked from find_vma(start) until
                                         * page_address_in_vma() returns something other than -EFAULT.  The
                                         * resulting vma/address pair is then handed to the hugetlb, THP or
                                         * order-0 allocator depending on the type of @page.
                                         *
                                         * NOTE(review): when no vma matches, 'vma' is NULL and 'address' was
                                         * never assigned (it is declared with uninitialized_var()), yet both
                                         * are still passed to the allocators -- presumably the address is
                                         * ignored for a NULL vma; confirm for the hugetlb and THP branches.
    1179                 :            :  */
    1180                 :          0 : static struct page *new_page(struct page *page, unsigned long start)
    1181                 :            : {
    1182                 :          0 :         struct vm_area_struct *vma;
    1183                 :          0 :         unsigned long uninitialized_var(address);
    1184                 :            : 
    1185                 :          0 :         vma = find_vma(current->mm, start);
    1186         [ #  # ]:          0 :         while (vma) {
    1187                 :          0 :                 address = page_address_in_vma(page, vma);
    1188         [ #  # ]:          0 :                 if (address != -EFAULT)
    1189                 :            :                         break;
    1190                 :          0 :                 vma = vma->vm_next;
    1191                 :            :         }
    1192                 :            : 
    1193         [ #  # ]:          0 :         if (PageHuge(page)) {
    1194         [ #  # ]:          0 :                 return alloc_huge_page_vma(page_hstate(compound_head(page)),
    1195                 :            :                                 vma, address);
    1196                 :          0 :         } else if (PageTransHuge(page)) {
    1197                 :            :                 struct page *thp;
    1198                 :            : 
    1199                 :            :                 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
    1200                 :            :                                          HPAGE_PMD_ORDER);
    1201                 :            :                 if (!thp)
    1202                 :            :                         return NULL;
    1203                 :            :                 prep_transhuge_page(thp);
    1204                 :            :                 return thp;
    1205                 :            :         }
    1206                 :            :         /*
    1207                 :            :          * if !vma, alloc_page_vma() will use task or system default policy
    1208                 :            :          */
    1209                 :          0 :         return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
    1210                 :            :                         vma, address);
    1211                 :            : }
    1212                 :            : #else
    1213                 :            : 
                                        /*
                                         * Stub used when CONFIG_MIGRATION is not set: no page can be queued
                                         * for migration, so every request fails with -EIO.
                                         */
    1214                 :            : static int migrate_page_add(struct page *page, struct list_head *pagelist,
    1215                 :            :                                 unsigned long flags)
    1216                 :            : {
    1217                 :            :         return -EIO;
    1218                 :            : }
    1219                 :            : 
                                        /*
                                         * Stub used when CONFIG_MIGRATION is not set: migration between
                                         * nodesets is unavailable and always reports -ENOSYS.
                                         */
    1220                 :            : int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
    1221                 :            :                      const nodemask_t *to, int flags)
    1222                 :            : {
    1223                 :            :         return -ENOSYS;
    1224                 :            : }
    1225                 :            : 
                                        /*
                                         * Stub used when CONFIG_MIGRATION is not set: never allocates a
                                         * replacement page and always returns NULL.
                                         */
    1226                 :            : static struct page *new_page(struct page *page, unsigned long start)
    1227                 :            : {
    1228                 :            :         return NULL;
    1229                 :            : }
    1230                 :            : #endif
    1231                 :            : 
    1232                 :          0 : static long do_mbind(unsigned long start, unsigned long len,
    1233                 :            :                      unsigned short mode, unsigned short mode_flags,
    1234                 :            :                      nodemask_t *nmask, unsigned long flags)
    1235                 :            : {
    1236         [ #  # ]:          0 :         struct mm_struct *mm = current->mm;
    1237                 :          0 :         struct mempolicy *new;
    1238                 :          0 :         unsigned long end;
    1239                 :          0 :         int err;
    1240                 :          0 :         int ret;
    1241                 :          0 :         LIST_HEAD(pagelist);
    1242                 :            : 
    1243         [ #  # ]:          0 :         if (flags & ~(unsigned long)MPOL_MF_VALID)
    1244                 :            :                 return -EINVAL;
    1245   [ #  #  #  # ]:          0 :         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
    1246                 :            :                 return -EPERM;
    1247                 :            : 
    1248         [ #  # ]:          0 :         if (start & ~PAGE_MASK)
    1249                 :            :                 return -EINVAL;
    1250                 :            : 
    1251         [ #  # ]:          0 :         if (mode == MPOL_DEFAULT)
    1252                 :          0 :                 flags &= ~MPOL_MF_STRICT;
    1253                 :            : 
    1254                 :          0 :         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    1255                 :          0 :         end = start + len;
    1256                 :            : 
    1257         [ #  # ]:          0 :         if (end < start)
    1258                 :            :                 return -EINVAL;
    1259         [ #  # ]:          0 :         if (end == start)
    1260                 :            :                 return 0;
    1261                 :            : 
    1262                 :          0 :         new = mpol_new(mode, mode_flags, nmask);
    1263         [ #  # ]:          0 :         if (IS_ERR(new))
    1264                 :          0 :                 return PTR_ERR(new);
    1265                 :            : 
    1266                 :          0 :         if (flags & MPOL_MF_LAZY)
    1267                 :            :                 new->flags |= MPOL_F_MOF;
    1268                 :            : 
    1269                 :            :         /*
    1270                 :            :          * If we are using the default policy then operation
    1271                 :            :          * on discontinuous address spaces is okay after all
    1272                 :            :          */
    1273         [ #  # ]:          0 :         if (!new)
    1274                 :          0 :                 flags |= MPOL_MF_DISCONTIG_OK;
    1275                 :            : 
    1276                 :          0 :         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
    1277                 :            :                  start, start + len, mode, mode_flags,
    1278                 :            :                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
    1279                 :            : 
    1280         [ #  # ]:          0 :         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
    1281                 :            : 
    1282                 :          0 :                 err = migrate_prep();
    1283         [ #  # ]:          0 :                 if (err)
    1284                 :          0 :                         goto mpol_out;
    1285                 :            :         }
    1286                 :            :         {
    1287                 :          0 :                 NODEMASK_SCRATCH(scratch);
    1288                 :          0 :                 if (scratch) {
    1289                 :          0 :                         down_write(&mm->mmap_sem);
    1290                 :          0 :                         task_lock(current);
    1291                 :          0 :                         err = mpol_set_nodemask(new, nmask, scratch);
    1292                 :          0 :                         task_unlock(current);
    1293         [ #  # ]:          0 :                         if (err)
    1294                 :          0 :                                 up_write(&mm->mmap_sem);
    1295                 :            :                 } else
    1296                 :            :                         err = -ENOMEM;
    1297                 :          0 :                 NODEMASK_SCRATCH_FREE(scratch);
    1298                 :            :         }
    1299         [ #  # ]:          0 :         if (err)
    1300                 :          0 :                 goto mpol_out;
    1301                 :            : 
    1302                 :          0 :         ret = queue_pages_range(mm, start, end, nmask,
    1303                 :            :                           flags | MPOL_MF_INVERT, &pagelist);
    1304                 :            : 
    1305         [ #  # ]:          0 :         if (ret < 0) {
    1306                 :          0 :                 err = ret;
    1307                 :          0 :                 goto up_out;
    1308                 :            :         }
    1309                 :            : 
    1310                 :          0 :         err = mbind_range(mm, start, end, new);
    1311                 :            : 
    1312         [ #  # ]:          0 :         if (!err) {
    1313                 :          0 :                 int nr_failed = 0;
    1314                 :            : 
    1315         [ #  # ]:          0 :                 if (!list_empty(&pagelist)) {
    1316         [ #  # ]:          0 :                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
    1317                 :          0 :                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
    1318                 :            :                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
    1319         [ #  # ]:          0 :                         if (nr_failed)
    1320                 :          0 :                                 putback_movable_pages(&pagelist);
    1321                 :            :                 }
    1322                 :            : 
    1323   [ #  #  #  #  :          0 :                 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
                   #  # ]
    1324                 :          0 :                         err = -EIO;
    1325                 :            :         } else {
    1326                 :          0 : up_out:
    1327         [ #  # ]:          0 :                 if (!list_empty(&pagelist))
    1328                 :          0 :                         putback_movable_pages(&pagelist);
    1329                 :            :         }
    1330                 :            : 
    1331                 :          0 :         up_write(&mm->mmap_sem);
    1332                 :          0 : mpol_out:
    1333         [ #  # ]:          0 :         mpol_put(new);
    1334                 :          0 :         return err;
    1335                 :            : }
    1336                 :            : 
    1337                 :            : /*
    1338                 :            :  * User space interface with variable sized bitmaps for nodelists.
    1339                 :            :  */
    1340                 :            : 
    1341                 :            : /* Copy a node mask from user space. */
    1342                 :          0 : static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
    1343                 :            :                      unsigned long maxnode)
    1344                 :            : {
    1345                 :          0 :         unsigned long k;
    1346                 :          0 :         unsigned long t;
    1347                 :          0 :         unsigned long nlongs;
    1348                 :          0 :         unsigned long endmask;
    1349                 :            : 
    1350                 :          0 :         --maxnode;
    1351         [ #  # ]:          0 :         nodes_clear(*nodes);
    1352         [ #  # ]:          0 :         if (maxnode == 0 || !nmask)
    1353                 :            :                 return 0;
    1354         [ #  # ]:          0 :         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
    1355                 :            :                 return -EINVAL;
    1356                 :            : 
    1357                 :          0 :         nlongs = BITS_TO_LONGS(maxnode);
    1358         [ #  # ]:          0 :         if ((maxnode % BITS_PER_LONG) == 0)
    1359                 :            :                 endmask = ~0UL;
    1360                 :            :         else
    1361                 :          0 :                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
    1362                 :            : 
    1363                 :            :         /*
    1364                 :            :          * When the user specified more nodes than supported just check
    1365                 :            :          * if the non supported part is all zero.
    1366                 :            :          *
    1367                 :            :          * If maxnode have more longs than MAX_NUMNODES, check
    1368                 :            :          * the bits in that area first. And then go through to
    1369                 :            :          * check the rest bits which equal or bigger than MAX_NUMNODES.
    1370                 :            :          * Otherwise, just check bits [MAX_NUMNODES, maxnode).
    1371                 :            :          */
    1372         [ #  # ]:          0 :         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
    1373         [ #  # ]:          0 :                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
    1374         [ #  # ]:          0 :                         if (get_user(t, nmask + k))
    1375                 :            :                                 return -EFAULT;
    1376         [ #  # ]:          0 :                         if (k == nlongs - 1) {
    1377         [ #  # ]:          0 :                                 if (t & endmask)
    1378                 :            :                                         return -EINVAL;
    1379         [ #  # ]:          0 :                         } else if (t)
    1380                 :            :                                 return -EINVAL;
    1381                 :            :                 }
    1382                 :            :                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
    1383                 :            :                 endmask = ~0UL;
    1384                 :            :         }
    1385                 :            : 
    1386                 :          0 :         if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
    1387                 :            :                 unsigned long valid_mask = endmask;
    1388                 :            : 
    1389                 :            :                 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
    1390                 :            :                 if (get_user(t, nmask + nlongs - 1))
    1391                 :            :                         return -EFAULT;
    1392                 :            :                 if (t & valid_mask)
    1393                 :            :                         return -EINVAL;
    1394                 :            :         }
    1395                 :            : 
    1396   [ #  #  #  # ]:          0 :         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
    1397                 :            :                 return -EFAULT;
    1398                 :          0 :         nodes_addr(*nodes)[nlongs-1] &= endmask;
    1399                 :          0 :         return 0;
    1400                 :            : }
    1401                 :            : 
    1402                 :            : /* Copy a kernel node mask to user space */
    1403                 :          0 : static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
    1404                 :            :                               nodemask_t *nodes)
    1405                 :            : {
    1406                 :          0 :         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
    1407                 :          0 :         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
    1408                 :            : 
    1409         [ #  # ]:          0 :         if (copy > nbytes) {
    1410         [ #  # ]:          0 :                 if (copy > PAGE_SIZE)
    1411                 :            :                         return -EINVAL;
    1412         [ #  # ]:          0 :                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
    1413                 :            :                         return -EFAULT;
    1414                 :            :                 copy = nbytes;
    1415                 :            :         }
    1416   [ #  #  #  # ]:          0 :         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
    1417                 :            : }
    1418                 :            : 
    1419                 :          0 : static long kernel_mbind(unsigned long start, unsigned long len,
    1420                 :            :                          unsigned long mode, const unsigned long __user *nmask,
    1421                 :            :                          unsigned long maxnode, unsigned int flags)
    1422                 :            : {
    1423                 :          0 :         nodemask_t nodes;
    1424                 :          0 :         int err;
    1425                 :          0 :         unsigned short mode_flags;
    1426                 :            : 
    1427                 :          0 :         start = untagged_addr(start);
    1428                 :          0 :         mode_flags = mode & MPOL_MODE_FLAGS;
    1429                 :          0 :         mode &= ~MPOL_MODE_FLAGS;
    1430         [ #  # ]:          0 :         if (mode >= MPOL_MAX)
    1431                 :            :                 return -EINVAL;
    1432   [ #  #  #  # ]:          0 :         if ((mode_flags & MPOL_F_STATIC_NODES) &&
    1433                 :            :             (mode_flags & MPOL_F_RELATIVE_NODES))
    1434                 :            :                 return -EINVAL;
    1435                 :          0 :         err = get_nodes(&nodes, nmask, maxnode);
    1436         [ #  # ]:          0 :         if (err)
    1437                 :          0 :                 return err;
    1438                 :          0 :         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
    1439                 :            : }
    1440                 :            : 
    1441                 :          0 : SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
    1442                 :            :                 unsigned long, mode, const unsigned long __user *, nmask,
    1443                 :            :                 unsigned long, maxnode, unsigned int, flags)
    1444                 :            : {
    1445                 :          0 :         return kernel_mbind(start, len, mode, nmask, maxnode, flags);
    1446                 :            : }
    1447                 :            : 
    1448                 :            : /* Set the process memory policy */
    1449                 :          0 : static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
    1450                 :            :                                  unsigned long maxnode)
    1451                 :            : {
    1452                 :          0 :         int err;
    1453                 :          0 :         nodemask_t nodes;
    1454                 :          0 :         unsigned short flags;
    1455                 :            : 
    1456                 :          0 :         flags = mode & MPOL_MODE_FLAGS;
    1457                 :          0 :         mode &= ~MPOL_MODE_FLAGS;
    1458         [ #  # ]:          0 :         if ((unsigned int)mode >= MPOL_MAX)
    1459                 :            :                 return -EINVAL;
    1460   [ #  #  #  # ]:          0 :         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
    1461                 :            :                 return -EINVAL;
    1462                 :          0 :         err = get_nodes(&nodes, nmask, maxnode);
    1463         [ #  # ]:          0 :         if (err)
    1464                 :          0 :                 return err;
    1465                 :          0 :         return do_set_mempolicy(mode, flags, &nodes);
    1466                 :            : }
    1467                 :            : 
    1468                 :          0 : SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
    1469                 :            :                 unsigned long, maxnode)
    1470                 :            : {
    1471                 :          0 :         return kernel_set_mempolicy(mode, nmask, maxnode);
    1472                 :            : }
    1473                 :            : 
    1474                 :          0 : static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
    1475                 :            :                                 const unsigned long __user *old_nodes,
    1476                 :            :                                 const unsigned long __user *new_nodes)
    1477                 :            : {
    1478                 :          0 :         struct mm_struct *mm = NULL;
    1479                 :          0 :         struct task_struct *task;
    1480                 :          0 :         nodemask_t task_nodes;
    1481                 :          0 :         int err;
    1482                 :          0 :         nodemask_t *old;
    1483                 :          0 :         nodemask_t *new;
    1484                 :          0 :         NODEMASK_SCRATCH(scratch);
    1485                 :            : 
    1486                 :          0 :         if (!scratch)
    1487                 :            :                 return -ENOMEM;
    1488                 :            : 
    1489                 :          0 :         old = &scratch->mask1;
    1490                 :          0 :         new = &scratch->mask2;
    1491                 :            : 
    1492                 :          0 :         err = get_nodes(old, old_nodes, maxnode);
    1493         [ #  # ]:          0 :         if (err)
    1494                 :          0 :                 goto out;
    1495                 :            : 
    1496                 :          0 :         err = get_nodes(new, new_nodes, maxnode);
    1497         [ #  # ]:          0 :         if (err)
    1498                 :          0 :                 goto out;
    1499                 :            : 
    1500                 :            :         /* Find the mm_struct */
    1501                 :          0 :         rcu_read_lock();
    1502         [ #  # ]:          0 :         task = pid ? find_task_by_vpid(pid) : current;
    1503         [ #  # ]:          0 :         if (!task) {
    1504                 :          0 :                 rcu_read_unlock();
    1505                 :          0 :                 err = -ESRCH;
    1506                 :          0 :                 goto out;
    1507                 :            :         }
    1508                 :          0 :         get_task_struct(task);
    1509                 :            : 
    1510                 :          0 :         err = -EINVAL;
    1511                 :            : 
    1512                 :            :         /*
    1513                 :            :          * Check if this process has the right to modify the specified process.
    1514                 :            :          * Use the regular "ptrace_may_access()" checks.
    1515                 :            :          */
    1516         [ #  # ]:          0 :         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
    1517                 :          0 :                 rcu_read_unlock();
    1518                 :          0 :                 err = -EPERM;
    1519                 :          0 :                 goto out_put;
    1520                 :            :         }
    1521                 :          0 :         rcu_read_unlock();
    1522                 :            : 
    1523                 :          0 :         task_nodes = cpuset_mems_allowed(task);
    1524                 :            :         /* Is the user allowed to access the target nodes? */
    1525   [ #  #  #  # ]:          0 :         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
    1526                 :          0 :                 err = -EPERM;
    1527                 :          0 :                 goto out_put;
    1528                 :            :         }
    1529                 :            : 
    1530                 :          0 :         task_nodes = cpuset_mems_allowed(current);
    1531                 :          0 :         nodes_and(*new, *new, task_nodes);
    1532         [ #  # ]:          0 :         if (nodes_empty(*new))
    1533                 :          0 :                 goto out_put;
    1534                 :            : 
    1535                 :          0 :         err = security_task_movememory(task);
    1536         [ #  # ]:          0 :         if (err)
    1537                 :          0 :                 goto out_put;
    1538                 :            : 
    1539                 :          0 :         mm = get_task_mm(task);
    1540                 :          0 :         put_task_struct(task);
    1541                 :            : 
    1542         [ #  # ]:          0 :         if (!mm) {
    1543                 :          0 :                 err = -EINVAL;
    1544                 :          0 :                 goto out;
    1545                 :            :         }
    1546                 :            : 
    1547         [ #  # ]:          0 :         err = do_migrate_pages(mm, old, new,
    1548                 :          0 :                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
    1549                 :            : 
    1550                 :          0 :         mmput(mm);
    1551                 :          0 : out:
    1552                 :          0 :         NODEMASK_SCRATCH_FREE(scratch);
    1553                 :            : 
    1554                 :          0 :         return err;
    1555                 :            : 
    1556                 :          0 : out_put:
    1557                 :          0 :         put_task_struct(task);
    1558                 :          0 :         goto out;
    1559                 :            : 
    1560                 :            : }
    1561                 :            : 
    1562                 :          0 : SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
    1563                 :            :                 const unsigned long __user *, old_nodes,
    1564                 :            :                 const unsigned long __user *, new_nodes)
    1565                 :            : {
    1566                 :          0 :         return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
    1567                 :            : }
    1568                 :            : 
    1569                 :            : 
    1570                 :            : /* Retrieve NUMA policy */
    1571                 :          0 : static int kernel_get_mempolicy(int __user *policy,
    1572                 :            :                                 unsigned long __user *nmask,
    1573                 :            :                                 unsigned long maxnode,
    1574                 :            :                                 unsigned long addr,
    1575                 :            :                                 unsigned long flags)
    1576                 :            : {
    1577                 :          0 :         int err;
    1578                 :          0 :         int uninitialized_var(pval);
    1579                 :          0 :         nodemask_t nodes;
    1580                 :            : 
    1581                 :          0 :         addr = untagged_addr(addr);
    1582                 :            : 
    1583   [ #  #  #  # ]:          0 :         if (nmask != NULL && maxnode < nr_node_ids)
    1584                 :            :                 return -EINVAL;
    1585                 :            : 
    1586                 :          0 :         err = do_get_mempolicy(&pval, &nodes, addr, flags);
    1587                 :            : 
    1588         [ #  # ]:          0 :         if (err)
    1589                 :            :                 return err;
    1590                 :            : 
    1591   [ #  #  #  # ]:          0 :         if (policy && put_user(pval, policy))
    1592                 :            :                 return -EFAULT;
    1593                 :            : 
    1594         [ #  # ]:          0 :         if (nmask)
    1595                 :          0 :                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
    1596                 :            : 
    1597                 :            :         return err;
    1598                 :            : }
    1599                 :            : 
    1600                 :          0 : SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
    1601                 :            :                 unsigned long __user *, nmask, unsigned long, maxnode,
    1602                 :            :                 unsigned long, addr, unsigned long, flags)
    1603                 :            : {
    1604                 :          0 :         return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
    1605                 :            : }
    1606                 :            : 
    1607                 :            : #ifdef CONFIG_COMPAT
    1608                 :            : 
    1609                 :          0 : COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
    1610                 :            :                        compat_ulong_t __user *, nmask,
    1611                 :            :                        compat_ulong_t, maxnode,
    1612                 :            :                        compat_ulong_t, addr, compat_ulong_t, flags)
    1613                 :            : {
    1614                 :          0 :         long err;
    1615                 :          0 :         unsigned long __user *nm = NULL;
    1616                 :          0 :         unsigned long nr_bits, alloc_size;
    1617                 :          0 :         DECLARE_BITMAP(bm, MAX_NUMNODES);
    1618                 :            : 
    1619                 :          0 :         nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
    1620                 :          0 :         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1621                 :            : 
    1622         [ #  # ]:          0 :         if (nmask)
    1623                 :          0 :                 nm = compat_alloc_user_space(alloc_size);
    1624                 :            : 
    1625                 :          0 :         err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
    1626                 :            : 
    1627         [ #  # ]:          0 :         if (!err && nmask) {
    1628                 :          0 :                 unsigned long copy_size;
    1629                 :          0 :                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
    1630                 :          0 :                 err = copy_from_user(bm, nm, copy_size);
    1631                 :            :                 /* ensure entire bitmap is zeroed */
    1632                 :          0 :                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
    1633                 :          0 :                 err |= compat_put_bitmap(nmask, bm, nr_bits);
    1634                 :            :         }
    1635                 :            : 
    1636                 :          0 :         return err;
    1637                 :            : }
    1638                 :            : 
    1639                 :          0 : COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
    1640                 :            :                        compat_ulong_t, maxnode)
    1641                 :            : {
    1642                 :          0 :         unsigned long __user *nm = NULL;
    1643                 :          0 :         unsigned long nr_bits, alloc_size;
    1644                 :          0 :         DECLARE_BITMAP(bm, MAX_NUMNODES);
    1645                 :            : 
    1646                 :          0 :         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    1647                 :          0 :         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1648                 :            : 
    1649         [ #  # ]:          0 :         if (nmask) {
    1650         [ #  # ]:          0 :                 if (compat_get_bitmap(bm, nmask, nr_bits))
    1651                 :            :                         return -EFAULT;
    1652                 :          0 :                 nm = compat_alloc_user_space(alloc_size);
    1653   [ #  #  #  # ]:          0 :                 if (copy_to_user(nm, bm, alloc_size))
    1654                 :            :                         return -EFAULT;
    1655                 :            :         }
    1656                 :            : 
    1657                 :          0 :         return kernel_set_mempolicy(mode, nm, nr_bits+1);
    1658                 :            : }
    1659                 :            : 
    1660                 :          0 : COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
    1661                 :            :                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
    1662                 :            :                        compat_ulong_t, maxnode, compat_ulong_t, flags)
    1663                 :            : {
    1664                 :          0 :         unsigned long __user *nm = NULL;
    1665                 :          0 :         unsigned long nr_bits, alloc_size;
    1666                 :          0 :         nodemask_t bm;
    1667                 :            : 
    1668                 :          0 :         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    1669                 :          0 :         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1670                 :            : 
    1671         [ #  # ]:          0 :         if (nmask) {
    1672         [ #  # ]:          0 :                 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
    1673                 :            :                         return -EFAULT;
    1674                 :          0 :                 nm = compat_alloc_user_space(alloc_size);
    1675   [ #  #  #  # ]:          0 :                 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
    1676                 :            :                         return -EFAULT;
    1677                 :            :         }
    1678                 :            : 
    1679                 :          0 :         return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
    1680                 :            : }
    1681                 :            : 
    1682                 :          0 : COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
    1683                 :            :                        compat_ulong_t, maxnode,
    1684                 :            :                        const compat_ulong_t __user *, old_nodes,
    1685                 :            :                        const compat_ulong_t __user *, new_nodes)
    1686                 :            : {
    1687                 :          0 :         unsigned long __user *old = NULL;
    1688                 :          0 :         unsigned long __user *new = NULL;
    1689                 :          0 :         nodemask_t tmp_mask;
    1690                 :          0 :         unsigned long nr_bits;
    1691                 :          0 :         unsigned long size;
    1692                 :            : 
    1693                 :          0 :         nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
    1694                 :          0 :         size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
    1695         [ #  # ]:          0 :         if (old_nodes) {
    1696         [ #  # ]:          0 :                 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
    1697                 :            :                         return -EFAULT;
    1698         [ #  # ]:          0 :                 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
    1699         [ #  # ]:          0 :                 if (new_nodes)
    1700                 :          0 :                         new = old + size / sizeof(unsigned long);
    1701   [ #  #  #  # ]:          0 :                 if (copy_to_user(old, nodes_addr(tmp_mask), size))
    1702                 :            :                         return -EFAULT;
    1703                 :            :         }
    1704         [ #  # ]:          0 :         if (new_nodes) {
    1705         [ #  # ]:          0 :                 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
    1706                 :            :                         return -EFAULT;
    1707         [ #  # ]:          0 :                 if (new == NULL)
    1708                 :          0 :                         new = compat_alloc_user_space(size);
    1709   [ #  #  #  # ]:          0 :                 if (copy_to_user(new, nodes_addr(tmp_mask), size))
    1710                 :            :                         return -EFAULT;
    1711                 :            :         }
    1712                 :          0 :         return kernel_migrate_pages(pid, nr_bits + 1, old, new);
    1713                 :            : }
    1714                 :            : 
    1715                 :            : #endif /* CONFIG_COMPAT */
    1716                 :            : 
    1717                 :     120552 : struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
    1718                 :            :                                                 unsigned long addr)
    1719                 :            : {
    1720                 :     120552 :         struct mempolicy *pol = NULL;
    1721                 :            : 
    1722         [ +  - ]:     120552 :         if (vma) {
    1723   [ +  +  -  + ]:     120552 :                 if (vma->vm_ops && vma->vm_ops->get_policy) {
    1724                 :          0 :                         pol = vma->vm_ops->get_policy(vma, addr);
    1725         [ -  + ]:     120552 :                 } else if (vma->vm_policy) {
    1726                 :          0 :                         pol = vma->vm_policy;
    1727                 :            : 
    1728                 :            :                         /*
    1729                 :            :                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
    1730                 :            :                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
    1731                 :            :                          * count on these policies which will be dropped by
    1732                 :            :                          * mpol_cond_put() later
    1733                 :            :                          */
    1734         [ #  # ]:          0 :                         if (mpol_needs_cond_ref(pol))
    1735                 :          0 :                                 mpol_get(pol);
    1736                 :            :                 }
    1737                 :            :         }
    1738                 :            : 
    1739                 :     120552 :         return pol;
    1740                 :            : }
    1741                 :            : 
    1742                 :            : /*
    1743                 :            :  * get_vma_policy(@vma, @addr)
    1744                 :            :  * @vma: virtual memory area whose policy is sought
    1745                 :            :  * @addr: address in @vma for shared policy lookup
    1746                 :            :  *
    1747                 :            :  * Returns effective policy for a VMA at specified address.
    1748                 :            :  * Falls back to current->mempolicy or system default policy, as necessary.
    1749                 :            :  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
    1750                 :            :  * count--added by the get_policy() vm_op, as appropriate--to protect against
    1751                 :            :  * freeing by another task.  It is the caller's responsibility to free the
    1752                 :            :  * extra reference for shared policies.
    1753                 :            :  */
    1754                 :     120552 : static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
    1755                 :            :                                                 unsigned long addr)
    1756                 :            : {
    1757                 :     120552 :         struct mempolicy *pol = __get_vma_policy(vma, addr);
    1758                 :            : 
    1759         [ +  - ]:     120552 :         if (!pol)
    1760         [ +  - ]:     120552 :                 pol = get_task_policy(current);
    1761                 :            : 
    1762                 :     120552 :         return pol;
    1763                 :            : }
    1764                 :            : 
    1765                 :          0 : bool vma_policy_mof(struct vm_area_struct *vma)
    1766                 :            : {
    1767                 :          0 :         struct mempolicy *pol;
    1768                 :            : 
    1769   [ #  #  #  # ]:          0 :         if (vma->vm_ops && vma->vm_ops->get_policy) {
    1770                 :          0 :                 bool ret = false;
    1771                 :            : 
    1772                 :          0 :                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
    1773   [ #  #  #  # ]:          0 :                 if (pol && (pol->flags & MPOL_F_MOF))
    1774                 :          0 :                         ret = true;
    1775         [ #  # ]:          0 :                 mpol_cond_put(pol);
    1776                 :            : 
    1777                 :          0 :                 return ret;
    1778                 :            :         }
    1779                 :            : 
    1780                 :          0 :         pol = vma->vm_policy;
    1781         [ #  # ]:          0 :         if (!pol)
    1782         [ #  # ]:          0 :                 pol = get_task_policy(current);
    1783                 :            : 
    1784                 :          0 :         return pol->flags & MPOL_F_MOF;
    1785                 :            : }
    1786                 :            : 
    1787                 :          0 : static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
    1788                 :            : {
    1789                 :          0 :         enum zone_type dynamic_policy_zone = policy_zone;
    1790                 :            : 
    1791         [ #  # ]:          0 :         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
    1792                 :            : 
    1793                 :            :         /*
    1794                 :            :          * if policy->v.nodes has movable memory only,
    1795                 :            :          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
    1796                 :            :          *
    1797                 :            :          * policy->v.nodes is intersected with node_states[N_MEMORY],
    1798                 :            :          * so if the following test fails, it implies
    1799                 :            :          * policy->v.nodes has movable memory only.
    1800                 :            :          */
    1801         [ #  # ]:          0 :         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
    1802                 :          0 :                 dynamic_policy_zone = ZONE_MOVABLE;
    1803                 :            : 
    1804                 :          0 :         return zone >= dynamic_policy_zone;
    1805                 :            : }
    1806                 :            : 
    1807                 :            : /*
    1808                 :            :  * Return a nodemask representing a mempolicy for filtering nodes for
    1809                 :            :  * page allocation
    1810                 :            :  */
    1811                 :     326028 : static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
    1812                 :            : {
    1813                 :            :         /* Lower zones don't get a nodemask applied for MPOL_BIND */
    1814   [ -  +  -  - ]:     326028 :         if (unlikely(policy->mode == MPOL_BIND) &&
    1815         [ #  # ]:          0 :                         apply_policy_zone(policy, gfp_zone(gfp)) &&
    1816                 :          0 :                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
    1817                 :          0 :                 return &policy->v.nodes;
    1818                 :            : 
    1819                 :            :         return NULL;
    1820                 :            : }
    1821                 :            : 
    1822                 :            : /* Return the node id preferred by the given mempolicy, or the given id */
    1823                 :     326025 : static int policy_node(gfp_t gfp, struct mempolicy *policy,
    1824                 :            :                                                                 int nd)
    1825                 :            : {
    1826   [ +  -  +  + ]:     326025 :         if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
    1827                 :     325839 :                 nd = policy->v.preferred_node;
    1828                 :            :         else {
    1829                 :            :                 /*
    1830                 :            :                  * __GFP_THISNODE shouldn't even be used with the bind policy
    1831                 :            :                  * because we might easily break the expectation to stay on the
    1832                 :            :                  * requested node and not break the policy.
    1833                 :            :                  */
    1834   [ -  +  -  -  :        372 :                 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
                   -  + ]
    1835                 :            :         }
    1836                 :            : 
    1837                 :     326025 :         return nd;
    1838                 :            : }
    1839                 :            : 
    1840                 :            : /* Do dynamic interleaving for a process */
    1841                 :      30791 : static unsigned interleave_nodes(struct mempolicy *policy)
    1842                 :            : {
    1843                 :      30791 :         unsigned next;
    1844                 :      30791 :         struct task_struct *me = current;
    1845                 :            : 
    1846                 :      30791 :         next = next_node_in(me->il_prev, policy->v.nodes);
    1847   [ +  -  -  -  :      30791 :         if (next < MAX_NUMNODES)
                   +  - ]
    1848                 :      30791 :                 me->il_prev = next;
    1849                 :      18742 :         return next;
    1850                 :            : }
    1851                 :            : 
    1852                 :            : /*
    1853                 :            :  * Depending on the memory policy provide a node from which to allocate the
    1854                 :            :  * next slab entry.
    1855                 :            :  */
    1856                 :      66063 : unsigned int mempolicy_slab_node(void)
    1857                 :            : {
    1858                 :      66063 :         struct mempolicy *policy;
    1859         [ +  + ]:      66063 :         int node = numa_mem_id();
    1860                 :            : 
    1861         [ +  + ]:      66063 :         if (in_interrupt())
    1862                 :         12 :                 return node;
    1863                 :            : 
    1864         [ +  + ]:      66051 :         policy = current->mempolicy;
    1865   [ +  +  -  + ]:      66051 :         if (!policy || policy->flags & MPOL_F_LOCAL)
    1866                 :      54002 :                 return node;
    1867                 :            : 
    1868   [ -  +  -  - ]:      12049 :         switch (policy->mode) {
    1869                 :          0 :         case MPOL_PREFERRED:
    1870                 :            :                 /*
    1871                 :            :                  * handled MPOL_F_LOCAL above
    1872                 :            :                  */
    1873                 :          0 :                 return policy->v.preferred_node;
    1874                 :            : 
    1875                 :            :         case MPOL_INTERLEAVE:
    1876                 :      12049 :                 return interleave_nodes(policy);
    1877                 :            : 
    1878                 :            :         case MPOL_BIND: {
    1879                 :          0 :                 struct zoneref *z;
    1880                 :            : 
    1881                 :            :                 /*
    1882                 :            :                  * Follow bind policy behavior and start allocation at the
    1883                 :            :                  * first node.
    1884                 :            :                  */
    1885                 :          0 :                 struct zonelist *zonelist;
    1886         [ #  # ]:          0 :                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
    1887                 :          0 :                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
    1888         [ #  # ]:          0 :                 z = first_zones_zonelist(zonelist, highest_zoneidx,
    1889                 :            :                                                         &policy->v.nodes);
    1890         [ #  # ]:          0 :                 return z->zone ? zone_to_nid(z->zone) : node;
    1891                 :            :         }
    1892                 :            : 
    1893                 :          0 :         default:
    1894                 :          0 :                 BUG();
    1895                 :            :         }
    1896                 :            : }
    1897                 :            : 
    1898                 :            : /*
    1899                 :            :  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
    1900                 :            :  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
    1901                 :            :  * number of present nodes.
    1902                 :            :  */
    1903                 :          0 : static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
    1904                 :            : {
    1905                 :          0 :         unsigned nnodes = nodes_weight(pol->v.nodes);
    1906                 :          0 :         unsigned target;
    1907                 :          0 :         int i;
    1908                 :          0 :         int nid;
    1909                 :            : 
    1910         [ #  # ]:          0 :         if (!nnodes)
    1911                 :          0 :                 return numa_node_id();
    1912                 :          0 :         target = (unsigned int)n % nnodes;
    1913                 :          0 :         nid = first_node(pol->v.nodes);
    1914         [ #  # ]:          0 :         for (i = 0; i < target; i++)
    1915                 :          0 :                 nid = next_node(nid, pol->v.nodes);
    1916                 :          0 :         return nid;
    1917                 :            : }
    1918                 :            : 
    1919                 :            : /* Determine a node number for interleave */
    1920                 :          0 : static inline unsigned interleave_nid(struct mempolicy *pol,
    1921                 :            :                  struct vm_area_struct *vma, unsigned long addr, int shift)
    1922                 :            : {
    1923         [ #  # ]:          0 :         if (vma) {
    1924                 :          0 :                 unsigned long off;
    1925                 :            : 
    1926                 :            :                 /*
    1927                 :            :                  * for small pages, there is no difference between
    1928                 :            :                  * shift and PAGE_SHIFT, so the bit-shift is safe.
    1929                 :            :                  * for huge pages, since vm_pgoff is in units of small
    1930                 :            :                  * pages, we need to shift off the always 0 bits to get
    1931                 :            :                  * a useful offset.
    1932                 :            :                  */
    1933         [ #  # ]:          0 :                 BUG_ON(shift < PAGE_SHIFT);
    1934                 :          0 :                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
    1935                 :          0 :                 off += (addr - vma->vm_start) >> shift;
    1936                 :          0 :                 return offset_il_node(pol, off);
    1937                 :            :         } else
    1938                 :          0 :                 return interleave_nodes(pol);
    1939                 :            : }
    1940                 :            : 
    1941                 :            : #ifdef CONFIG_HUGETLBFS
    1942                 :            : /*
    1943                 :            :  * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
    1944                 :            :  * @vma: virtual memory area whose policy is sought
    1945                 :            :  * @addr: address in @vma for shared policy lookup and interleave policy
    1946                 :            :  * @gfp_flags: for requested zone
    1947                 :            :  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
    1948                 :            :  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
    1949                 :            :  *
    1950                 :            :  * Returns a nid suitable for a huge page allocation and a pointer
    1951                 :            :  * to the struct mempolicy for conditional unref after allocation.
    1952                 :            :  * If the effective policy is 'BIND', returns a pointer to the mempolicy's
    1953                 :            :  * @nodemask for filtering the zonelist.
    1954                 :            :  *
    1955                 :            :  * Must be protected by read_mems_allowed_begin()
    1956                 :            :  */
    1957                 :          0 : int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
    1958                 :            :                                 struct mempolicy **mpol, nodemask_t **nodemask)
    1959                 :            : {
    1960                 :          0 :         int nid;
    1961                 :            : 
    1962                 :          0 :         *mpol = get_vma_policy(vma, addr);
    1963                 :          0 :         *nodemask = NULL;       /* assume !MPOL_BIND */
    1964                 :            : 
    1965         [ #  # ]:          0 :         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
    1966                 :          0 :                 nid = interleave_nid(*mpol, vma, addr,
    1967                 :          0 :                                         huge_page_shift(hstate_vma(vma)));
    1968                 :            :         } else {
    1969                 :          0 :                 nid = policy_node(gfp_flags, *mpol, numa_node_id());
    1970         [ #  # ]:          0 :                 if ((*mpol)->mode == MPOL_BIND)
    1971                 :          0 :                         *nodemask = &(*mpol)->v.nodes;
    1972                 :            :         }
    1973                 :          0 :         return nid;
    1974                 :            : }
    1975                 :            : 
    1976                 :            : /*
    1977                 :            :  * init_nodemask_of_mempolicy
    1978                 :            :  *
    1979                 :            :  * If the current task's mempolicy is "default" [NULL], return 'false'
    1980                 :            :  * to indicate default policy.  Otherwise, extract the policy nodemask
    1981                 :            :  * for 'bind' or 'interleave' policy into the argument nodemask, or
    1982                 :            :  * initialize the argument nodemask to contain the single node for
    1983                 :            :  * 'preferred' or 'local' policy and return 'true' to indicate presence
    1984                 :            :  * of non-default mempolicy.
    1985                 :            :  *
    1986                 :            :  * We don't bother with reference counting the mempolicy [mpol_get/put]
    1987                 :            :  * because the current task is examining its own mempolicy and a task's
    1988                 :            :  * mempolicy is only ever changed by the task itself.
    1989                 :            :  *
    1990                 :            :  * N.B., it is the caller's responsibility to free a returned nodemask.
    1991                 :            :  */
    1992                 :          0 : bool init_nodemask_of_mempolicy(nodemask_t *mask)
    1993                 :            : {
    1994                 :          0 :         struct mempolicy *mempolicy;
    1995                 :          0 :         int nid;
    1996                 :            : 
    1997   [ #  #  #  # ]:          0 :         if (!(mask && current->mempolicy))
    1998                 :            :                 return false;
    1999                 :            : 
    2000                 :          0 :         task_lock(current);
    2001      [ #  #  # ]:          0 :         mempolicy = current->mempolicy;
    2002      [ #  #  # ]:          0 :         switch (mempolicy->mode) {
    2003                 :          0 :         case MPOL_PREFERRED:
    2004         [ #  # ]:          0 :                 if (mempolicy->flags & MPOL_F_LOCAL)
    2005                 :          0 :                         nid = numa_node_id();
    2006                 :            :                 else
    2007                 :          0 :                         nid = mempolicy->v.preferred_node;
    2008                 :          0 :                 init_nodemask_of_node(mask, nid);
    2009                 :            :                 break;
    2010                 :            : 
    2011                 :          0 :         case MPOL_BIND:
    2012                 :            :                 /* Fall through */
    2013                 :            :         case MPOL_INTERLEAVE:
    2014                 :          0 :                 *mask =  mempolicy->v.nodes;
    2015                 :          0 :                 break;
    2016                 :            : 
    2017                 :          0 :         default:
    2018                 :          0 :                 BUG();
    2019                 :            :         }
    2020                 :          0 :         task_unlock(current);
    2021                 :            : 
    2022                 :          0 :         return true;
    2023                 :            : }
    2024                 :            : #endif
    2025                 :            : 
    2026                 :            : /*
    2027                 :            :  * mempolicy_nodemask_intersects
    2028                 :            :  *
    2029                 :            :  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
    2030                 :            :  * policy.  Otherwise, check for intersection between mask and the policy
    2031                 :            :  * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
    2032                 :            :  * policy, always return true since it may allocate elsewhere on fallback.
    2033                 :            :  *
    2034                 :            :  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
    2035                 :            :  */
    2036                 :          0 : bool mempolicy_nodemask_intersects(struct task_struct *tsk,
    2037                 :            :                                         const nodemask_t *mask)
    2038                 :            : {
    2039                 :          0 :         struct mempolicy *mempolicy;
    2040                 :          0 :         bool ret = true;
    2041                 :            : 
    2042         [ #  # ]:          0 :         if (!mask)
    2043                 :            :                 return ret;
    2044                 :          0 :         task_lock(tsk);
    2045                 :          0 :         mempolicy = tsk->mempolicy;
    2046         [ #  # ]:          0 :         if (!mempolicy)
    2047                 :          0 :                 goto out;
    2048                 :            : 
    2049      [ #  #  # ]:          0 :         switch (mempolicy->mode) {
    2050                 :            :         case MPOL_PREFERRED:
    2051                 :            :                 /*
    2052                 :            :                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
    2053                 :            :                  * allocate from; they may fall back to other nodes when oom.
    2054                 :            :                  * Thus, it's possible for tsk to have allocated memory from
    2055                 :            :                  * nodes in mask.
    2056                 :            :                  */
    2057                 :            :                 break;
    2058                 :          0 :         case MPOL_BIND:
    2059                 :            :         case MPOL_INTERLEAVE:
    2060                 :          0 :                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
    2061                 :          0 :                 break;
    2062                 :          0 :         default:
    2063                 :          0 :                 BUG();
    2064                 :            :         }
    2065                 :          0 : out:
    2066                 :          0 :         task_unlock(tsk);
    2067                 :          0 :         return ret;
    2068                 :            : }
    2069                 :            : 
    2070                 :            : /* Allocate a page in interleaved policy.
    2071                 :            :    Own path because it needs to do special accounting. */
    2072                 :      18742 : static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
    2073                 :            :                                         unsigned nid)
    2074                 :            : {
    2075                 :      18742 :         struct page *page;
    2076                 :            : 
    2077                 :      18742 :         page = __alloc_pages(gfp, order, nid);
    2078                 :            :         /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
    2079   [ +  -  +  - ]:      37484 :         if (!static_branch_likely(&vm_numa_stat_key))
    2080                 :            :                 return page;
    2081   [ +  -  +  - ]:      18742 :         if (page && page_to_nid(page) == nid) {
    2082                 :      18742 :                 preempt_disable();
    2083                 :      18742 :                 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
    2084                 :      18742 :                 preempt_enable();
    2085                 :            :         }
    2086                 :            :         return page;
    2087                 :            : }
    2088                 :            : 
    2089                 :            : /**
    2090                 :            :  *      alloc_pages_vma - Allocate a page for a VMA.
    2091                 :            :  *
    2092                 :            :  *      @gfp:
    2093                 :            :  *      %GFP_USER    user allocation.
    2094                 :            :  *      %GFP_KERNEL  kernel allocations,
    2095                 :            :  *      %GFP_HIGHMEM highmem/user allocations,
    2096                 :            :  *      %GFP_FS      allocation should not call back into a file system.
    2097                 :            :  *      %GFP_ATOMIC  don't sleep.
    2098                 :            :  *
    2099                 :            :  *      @order:Order of the GFP allocation.
    2100                 :            :  *      @vma:  Pointer to VMA or NULL if not available.
    2101                 :            :  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
    2102                 :            :  *      @node: Which node to prefer for allocation (modulo policy).
    2103                 :            :  *      @hugepage: for hugepages try only the preferred node if possible
    2104                 :            :  *
    2105                 :            :  *      This function allocates a page from the kernel page pool and applies
    2106                 :            :  *      a NUMA policy associated with the VMA or the current process.
    2107                 :            :  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
    2108                 :            :  *      mm_struct of the VMA to prevent it from going away. Should be used for
    2109                 :            :  *      all allocations for pages that will be mapped into user space. Returns
    2110                 :            :  *      NULL when no page can be allocated.
    2111                 :            :  */
    2112                 :            : struct page *
    2113                 :     120552 : alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
    2114                 :            :                 unsigned long addr, int node, bool hugepage)
    2115                 :            : {
    2116                 :     120552 :         struct mempolicy *pol;
    2117                 :     120552 :         struct page *page;
    2118                 :     120552 :         int preferred_nid;
    2119                 :     120552 :         nodemask_t *nmask;
    2120                 :            : 
    2121                 :     120552 :         pol = get_vma_policy(vma, addr);
    2122                 :            : 
    2123         [ -  + ]:     120552 :         if (pol->mode == MPOL_INTERLEAVE) {
    2124                 :          0 :                 unsigned nid;
    2125                 :            : 
    2126                 :          0 :                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
    2127         [ #  # ]:          0 :                 mpol_cond_put(pol);
    2128                 :          0 :                 page = alloc_page_interleave(gfp, order, nid);
    2129                 :          0 :                 goto out;
    2130                 :            :         }
    2131                 :            : 
    2132                 :     120552 :         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
    2133                 :            :                 int hpage_node = node;
    2134                 :            : 
    2135                 :            :                 /*
    2136                 :            :                  * For hugepage allocation and non-interleave policy which
    2137                 :            :                  * allows the current node (or other explicitly preferred
    2138                 :            :                  * node) we only try to allocate from the current/preferred
    2139                 :            :                  * node and don't fall back to other nodes, as the cost of
    2140                 :            :                  * remote accesses would likely offset THP benefits.
    2141                 :            :                  *
    2142                 :            :                  * If the policy is interleave, or does not allow the current
    2143                 :            :                  * node in its nodemask, we allocate the standard way.
    2144                 :            :                  */
    2145                 :            :                 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
    2146                 :            :                         hpage_node = pol->v.preferred_node;
    2147                 :            : 
    2148                 :            :                 nmask = policy_nodemask(gfp, pol);
    2149                 :            :                 if (!nmask || node_isset(hpage_node, *nmask)) {
    2150                 :            :                         mpol_cond_put(pol);
    2151                 :            :                         /*
    2152                 :            :                          * First, try to allocate THP only on local node, but
    2153                 :            :                          * don't reclaim unnecessarily, just compact.
    2154                 :            :                          */
    2155                 :            :                         page = __alloc_pages_node(hpage_node,
    2156                 :            :                                 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
    2157                 :            : 
    2158                 :            :                         /*
    2159                 :            :                          * If hugepage allocations are configured to always
    2160                 :            :                          * synchronous compact or the vma has been madvised
    2161                 :            :                          * to prefer hugepage backing, retry allowing remote
    2162                 :            :                          * memory with both reclaim and compact as well.
    2163                 :            :                          */
    2164                 :            :                         if (!page && (gfp & __GFP_DIRECT_RECLAIM))
    2165                 :            :                                 page = __alloc_pages_node(hpage_node,
    2166                 :            :                                                                 gfp, order);
    2167                 :            : 
    2168                 :            :                         goto out;
    2169                 :            :                 }
    2170                 :            :         }
    2171                 :            : 
    2172                 :     120552 :         nmask = policy_nodemask(gfp, pol);
    2173                 :     120552 :         preferred_nid = policy_node(gfp, pol, node);
    2174                 :     120552 :         page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
    2175         [ +  - ]:     120552 :         mpol_cond_put(pol);
    2176                 :     120552 : out:
    2177                 :     120552 :         return page;
    2178                 :            : }
    2179                 :            : EXPORT_SYMBOL(alloc_pages_vma);
    2180                 :            : 
    2181                 :            : /**
    2182                 :            :  *      alloc_pages_current - Allocate pages.
    2183                 :            :  *
    2184                 :            :  *      @gfp:
    2185                 :            :  *              %GFP_USER   user allocation,
    2186                 :            :  *              %GFP_KERNEL kernel allocation,
    2187                 :            :  *              %GFP_HIGHMEM highmem allocation,
    2188                 :            :  *              %GFP_FS     don't call back into a file system.
    2189                 :            :  *              %GFP_ATOMIC don't sleep.
    2190                 :            :  *      @order: Power of two of allocation size in pages. 0 is a single page.
    2191                 :            :  *
    2192                 :            :  *      Allocate a page from the kernel page pool.  When not in
    2193                 :            :  *      interrupt context and apply the current process NUMA policy.
    2194                 :            :  *      Returns NULL when no page can be allocated.
    2195                 :            :  */
    2196                 :     224186 : struct page *alloc_pages_current(gfp_t gfp, unsigned order)
    2197                 :            : {
    2198                 :     224186 :         struct mempolicy *pol = &default_policy;
    2199                 :     224186 :         struct page *page;
    2200                 :            : 
    2201   [ +  +  +  - ]:     224186 :         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
    2202         [ +  + ]:     224174 :                 pol = get_task_policy(current);
    2203                 :            : 
    2204                 :            :         /*
    2205                 :            :          * No reference counting needed for current->mempolicy
    2206                 :            :          * nor system default_policy
    2207                 :            :          */
    2208         [ +  + ]:     224186 :         if (pol->mode == MPOL_INTERLEAVE)
    2209                 :      37484 :                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
    2210                 :            :         else
    2211                 :     205444 :                 page = __alloc_pages_nodemask(gfp, order,
    2212                 :            :                                 policy_node(gfp, pol, numa_node_id()),
    2213                 :            :                                 policy_nodemask(gfp, pol));
    2214                 :            : 
    2215                 :     224186 :         return page;
    2216                 :            : }
    2217                 :            : EXPORT_SYMBOL(alloc_pages_current);
    2218                 :            : 
    2219                 :     206830 : int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
    2220                 :            : {
    2221         [ -  + ]:     206830 :         struct mempolicy *pol = mpol_dup(vma_policy(src));
    2222                 :            : 
    2223         [ -  + ]:     206830 :         if (IS_ERR(pol))
    2224                 :          0 :                 return PTR_ERR(pol);
    2225                 :     206830 :         dst->vm_policy = pol;
    2226                 :     206830 :         return 0;
    2227                 :            : }
    2228                 :            : 
    2229                 :            : /*
    2230                 :            :  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
    2231                 :            :  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
    2232                 :            :  * with the mems_allowed returned by cpuset_mems_allowed().  This
    2233                 :            :  * keeps mempolicies cpuset relative after its cpuset moves.  See
    2234                 :            :  * further kernel/cpuset.c update_nodemask().
    2235                 :            :  *
    2236                 :            :  * current's mempolicy may be rebound by another task (the task that changes
    2237                 :            :  * the cpuset's mems), so we needn't do rebind work for the current task.
    2238                 :            :  */
    2239                 :            : 
    2240                 :            : /* Slow path of a mempolicy duplicate */
    2241                 :          3 : struct mempolicy *__mpol_dup(struct mempolicy *old)
    2242                 :            : {
    2243                 :          3 :         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    2244                 :            : 
    2245         [ +  - ]:          3 :         if (!new)
    2246                 :            :                 return ERR_PTR(-ENOMEM);
    2247                 :            : 
    2248                 :            :         /* task's mempolicy is protected by alloc_lock */
    2249         [ +  - ]:          3 :         if (old == current->mempolicy) {
    2250                 :          3 :                 task_lock(current);
    2251                 :          3 :                 *new = *old;
    2252                 :          3 :                 task_unlock(current);
    2253                 :            :         } else
    2254                 :          0 :                 *new = *old;
    2255                 :            : 
    2256         [ -  + ]:          3 :         if (current_cpuset_is_being_rebound()) {
    2257                 :          0 :                 nodemask_t mems = cpuset_mems_allowed(current);
    2258                 :          0 :                 mpol_rebind_policy(new, &mems);
    2259                 :            :         }
    2260                 :          3 :         atomic_set(&new->refcnt, 1);
    2261                 :          3 :         return new;
    2262                 :            : }
    2263                 :            : 
    2264                 :            : /* Slow path of a mempolicy comparison */
    2265                 :          0 : bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
    2266                 :            : {
    2267         [ #  # ]:          0 :         if (!a || !b)
    2268                 :            :                 return false;
    2269         [ #  # ]:          0 :         if (a->mode != b->mode)
    2270                 :            :                 return false;
    2271         [ #  # ]:          0 :         if (a->flags != b->flags)
    2272                 :            :                 return false;
    2273         [ #  # ]:          0 :         if (mpol_store_user_nodemask(a))
    2274         [ #  # ]:          0 :                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
    2275                 :            :                         return false;
    2276                 :            : 
    2277      [ #  #  # ]:          0 :         switch (a->mode) {
    2278                 :          0 :         case MPOL_BIND:
    2279                 :            :                 /* Fall through */
    2280                 :            :         case MPOL_INTERLEAVE:
    2281                 :          0 :                 return !!nodes_equal(a->v.nodes, b->v.nodes);
    2282                 :          0 :         case MPOL_PREFERRED:
    2283                 :            :                 /* a's ->flags is the same as b's */
    2284         [ #  # ]:          0 :                 if (a->flags & MPOL_F_LOCAL)
    2285                 :            :                         return true;
    2286                 :          0 :                 return a->v.preferred_node == b->v.preferred_node;
    2287                 :          0 :         default:
    2288                 :          0 :                 BUG();
    2289                 :            :                 return false;
    2290                 :            :         }
    2291                 :            : }
    2292                 :            : 
    2293                 :            : /*
    2294                 :            :  * Shared memory backing store policy support.
    2295                 :            :  *
    2296                 :            :  * Remember policies even when nobody has shared memory mapped.
    2297                 :            :  * The policies are kept in a red-black tree linked from the inode.
    2298                 :            :  * They are protected by the sp->lock rwlock, which should be held
    2299                 :            :  * for any accesses to the tree.
    2300                 :            :  */
    2301                 :            : 
    2302                 :            : /*
    2303                 :            :  * Look up the first element intersecting [start, end).  Caller holds sp->lock
    2304                 :            :  * for reading or for writing.
    2305                 :            :  */
    2306                 :            : static struct sp_node *
    2307                 :            : sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
    2308                 :            : {
    2309                 :            :         struct rb_node *n = sp->root.rb_node;
    2310                 :            : 
    2311                 :            :         while (n) {
    2312                 :            :                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
    2313                 :            : 
    2314                 :            :                 if (start >= p->end)
    2315                 :            :                         n = n->rb_right;
    2316                 :            :                 else if (end <= p->start)
    2317                 :            :                         n = n->rb_left;
    2318                 :            :                 else
    2319                 :            :                         break;
    2320                 :            :         }
    2321                 :            :         if (!n)
    2322                 :            :                 return NULL;
    2323                 :            :         for (;;) {
    2324                 :            :                 struct sp_node *w = NULL;
    2325                 :            :                 struct rb_node *prev = rb_prev(n);
    2326                 :            :                 if (!prev)
    2327                 :            :                         break;
    2328                 :            :                 w = rb_entry(prev, struct sp_node, nd);
    2329                 :            :                 if (w->end <= start)
    2330                 :            :                         break;
    2331                 :            :                 n = prev;
    2332                 :            :         }
    2333                 :            :         return rb_entry(n, struct sp_node, nd);
    2334                 :            : }
    2335                 :            : 
    2336                 :            : /*
    2337                 :            :  * Insert a new shared policy into the tree.  Caller holds sp->lock for
    2338                 :            :  * writing.
    2339                 :            :  */
    2340                 :          0 : static void sp_insert(struct shared_policy *sp, struct sp_node *new)
    2341                 :            : {
    2342                 :          0 :         struct rb_node **p = &sp->root.rb_node;
    2343                 :          0 :         struct rb_node *parent = NULL;
    2344                 :          0 :         struct sp_node *nd;
    2345                 :            : 
    2346                 :          0 :         while (*p) {
    2347                 :          0 :                 parent = *p;
    2348                 :          0 :                 nd = rb_entry(parent, struct sp_node, nd);
    2349         [ #  # ]:          0 :                 if (new->start < nd->start)
    2350                 :          0 :                         p = &(*p)->rb_left;
    2351         [ #  # ]:          0 :                 else if (new->end > nd->end)
    2352                 :          0 :                         p = &(*p)->rb_right;
    2353                 :            :                 else
    2354         [ #  # ]:          0 :                         BUG();
    2355                 :            :         }
    2356                 :          0 :         rb_link_node(&new->nd, parent, p);
    2357                 :          0 :         rb_insert_color(&new->nd, &sp->root);
    2358                 :          0 :         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
    2359                 :            :                  new->policy ? new->policy->mode : 0);
    2360                 :          0 : }
    2361                 :            : 
    2362                 :            : /* Find shared policy intersecting idx */
    2363                 :            : struct mempolicy *
    2364                 :       3951 : mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
    2365                 :            : {
    2366                 :       3951 :         struct mempolicy *pol = NULL;
    2367                 :       3951 :         struct sp_node *sn;
    2368                 :            : 
    2369         [ -  + ]:       3951 :         if (!sp->root.rb_node)
    2370                 :            :                 return NULL;
    2371                 :          0 :         read_lock(&sp->lock);
    2372                 :          0 :         sn = sp_lookup(sp, idx, idx+1);
    2373         [ #  # ]:          0 :         if (sn) {
    2374         [ #  # ]:          0 :                 mpol_get(sn->policy);
    2375                 :          0 :                 pol = sn->policy;
    2376                 :            :         }
    2377                 :          0 :         read_unlock(&sp->lock);
    2378                 :          0 :         return pol;
    2379                 :            : }
    2380                 :            : 
    2381                 :          0 : static void sp_free(struct sp_node *n)
    2382                 :            : {
    2383         [ #  # ]:          0 :         mpol_put(n->policy);
    2384                 :          0 :         kmem_cache_free(sn_cache, n);
    2385                 :          0 : }
    2386                 :            : 
    2387                 :            : /**
    2388                 :            :  * mpol_misplaced - check whether current page node is valid in policy
    2389                 :            :  *
    2390                 :            :  * @page: page to be checked
    2391                 :            :  * @vma: vm area where page mapped
    2392                 :            :  * @addr: virtual address where page mapped
    2393                 :            :  *
    2394                 :            :  * Lookup current policy node id for vma,addr and "compare to" page's
    2395                 :            :  * node id.
    2396                 :            :  *
    2397                 :            :  * Returns:
    2398                 :            :  *      -1      - not misplaced, page is in the right node
    2399                 :            :  *      node    - node id where the page should be
    2400                 :            :  *
    2401                 :            :  * Policy determination "mimics" alloc_page_vma().
    2402                 :            :  * Called from fault path where we know the vma and faulting address.
    2403                 :            :  */
    2404                 :          0 : int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
    2405                 :            : {
    2406                 :          0 :         struct mempolicy *pol;
    2407                 :          0 :         struct zoneref *z;
    2408                 :          0 :         int curnid = page_to_nid(page);
    2409                 :          0 :         unsigned long pgoff;
    2410                 :          0 :         int thiscpu = raw_smp_processor_id();
    2411                 :          0 :         int thisnid = cpu_to_node(thiscpu);
    2412                 :          0 :         int polnid = NUMA_NO_NODE;
    2413                 :          0 :         int ret = -1;
    2414                 :            : 
    2415                 :          0 :         pol = get_vma_policy(vma, addr);
    2416         [ #  # ]:          0 :         if (!(pol->flags & MPOL_F_MOF))
    2417                 :          0 :                 goto out;
    2418                 :            : 
    2419   [ #  #  #  # ]:          0 :         switch (pol->mode) {
    2420                 :          0 :         case MPOL_INTERLEAVE:
    2421                 :          0 :                 pgoff = vma->vm_pgoff;
    2422                 :          0 :                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
    2423                 :          0 :                 polnid = offset_il_node(pol, pgoff);
    2424                 :          0 :                 break;
    2425                 :            : 
    2426                 :          0 :         case MPOL_PREFERRED:
    2427         [ #  # ]:          0 :                 if (pol->flags & MPOL_F_LOCAL)
    2428                 :          0 :                         polnid = numa_node_id();
    2429                 :            :                 else
    2430                 :          0 :                         polnid = pol->v.preferred_node;
    2431                 :            :                 break;
    2432                 :            : 
    2433                 :          0 :         case MPOL_BIND:
    2434                 :            : 
    2435                 :            :                 /*
    2436                 :            :                  * allows binding to multiple nodes.
    2437                 :            :                  * use current page if in policy nodemask,
    2438                 :            :                  * else select nearest allowed node, if any.
    2439                 :            :                  * If no allowed nodes, use current [!misplaced].
    2440                 :            :                  */
    2441         [ #  # ]:          0 :                 if (node_isset(curnid, pol->v.nodes))
    2442                 :          0 :                         goto out;
    2443         [ #  # ]:          0 :                 z = first_zones_zonelist(
    2444                 :            :                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
    2445                 :            :                                 gfp_zone(GFP_HIGHUSER),
    2446                 :            :                                 &pol->v.nodes);
    2447                 :          0 :                 polnid = zone_to_nid(z->zone);
    2448                 :          0 :                 break;
    2449                 :            : 
    2450                 :          0 :         default:
    2451                 :          0 :                 BUG();
    2452                 :            :         }
    2453                 :            : 
    2454                 :            :         /* Migrate the page towards the node whose CPU is referencing it */
    2455         [ #  # ]:          0 :         if (pol->flags & MPOL_F_MORON) {
    2456                 :          0 :                 polnid = thisnid;
    2457                 :            : 
    2458                 :          0 :                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
    2459                 :            :                         goto out;
    2460                 :            :         }
    2461                 :            : 
    2462         [ #  # ]:          0 :         if (curnid != polnid)
    2463                 :          0 :                 ret = polnid;
    2464                 :          0 : out:
    2465         [ #  # ]:          0 :         mpol_cond_put(pol);
    2466                 :            : 
    2467                 :          0 :         return ret;
    2468                 :            : }
    2469                 :            : 
    2470                 :            : /*
    2471                 :            :  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
    2472                 :            :  * dropped after task->mempolicy is set to NULL so that any allocation done as
    2473                 :            :  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
    2474                 :            :  * policy.
    2475                 :            :  */
    2476                 :       2706 : void mpol_put_task_policy(struct task_struct *task)
    2477                 :            : {
    2478                 :       2706 :         struct mempolicy *pol;
    2479                 :            : 
    2480                 :       2706 :         task_lock(task);
    2481                 :       2706 :         pol = task->mempolicy;
    2482                 :       2706 :         task->mempolicy = NULL;
    2483                 :       2706 :         task_unlock(task);
    2484         [ -  + ]:       2706 :         mpol_put(pol);
    2485                 :       2706 : }
    2486                 :            : 
    2487                 :          0 : static void sp_delete(struct shared_policy *sp, struct sp_node *n)
    2488                 :            : {
    2489                 :          0 :         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
    2490                 :          0 :         rb_erase(&n->nd, &sp->root);
    2491                 :          0 :         sp_free(n);
    2492                 :          0 : }
    2493                 :            : 
    2494                 :          0 : static void sp_node_init(struct sp_node *node, unsigned long start,
    2495                 :            :                         unsigned long end, struct mempolicy *pol)
    2496                 :            : {
    2497                 :          0 :         node->start = start;
    2498                 :          0 :         node->end = end;
    2499                 :          0 :         node->policy = pol;
    2500                 :            : }
    2501                 :            : 
    2502                 :          0 : static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
    2503                 :            :                                 struct mempolicy *pol)
    2504                 :            : {
    2505                 :          0 :         struct sp_node *n;
    2506                 :          0 :         struct mempolicy *newpol;
    2507                 :            : 
    2508                 :          0 :         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
    2509         [ #  # ]:          0 :         if (!n)
    2510                 :            :                 return NULL;
    2511                 :            : 
    2512         [ #  # ]:          0 :         newpol = mpol_dup(pol);
    2513         [ #  # ]:          0 :         if (IS_ERR(newpol)) {
    2514                 :          0 :                 kmem_cache_free(sn_cache, n);
    2515                 :          0 :                 return NULL;
    2516                 :            :         }
    2517                 :          0 :         newpol->flags |= MPOL_F_SHARED;
    2518                 :          0 :         sp_node_init(n, start, end, newpol);
    2519                 :            : 
    2520                 :          0 :         return n;
    2521                 :            : }
    2522                 :            : 
    2523                 :            : /* Replace a policy range. */
    2524                 :          0 : static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
    2525                 :            :                                  unsigned long end, struct sp_node *new)
    2526                 :            : {
    2527                 :          0 :         struct sp_node *n;
    2528                 :          0 :         struct sp_node *n_new = NULL;
    2529                 :          0 :         struct mempolicy *mpol_new = NULL;
    2530                 :          0 :         int ret = 0;
    2531                 :            : 
    2532                 :          0 : restart:
    2533                 :          0 :         write_lock(&sp->lock);
    2534                 :          0 :         n = sp_lookup(sp, start, end);
    2535                 :            :         /* Take care of old policies in the same range. */
    2536   [ #  #  #  # ]:          0 :         while (n && n->start < end) {
    2537                 :          0 :                 struct rb_node *next = rb_next(&n->nd);
    2538         [ #  # ]:          0 :                 if (n->start >= start) {
    2539         [ #  # ]:          0 :                         if (n->end <= end)
    2540                 :          0 :                                 sp_delete(sp, n);
    2541                 :            :                         else
    2542                 :          0 :                                 n->start = end;
    2543                 :            :                 } else {
    2544                 :            :                         /* Old policy spanning whole new range. */
    2545         [ #  # ]:          0 :                         if (n->end > end) {
    2546         [ #  # ]:          0 :                                 if (!n_new)
    2547                 :          0 :                                         goto alloc_new;
    2548                 :            : 
    2549                 :          0 :                                 *mpol_new = *n->policy;
    2550                 :          0 :                                 atomic_set(&mpol_new->refcnt, 1);
    2551                 :          0 :                                 sp_node_init(n_new, end, n->end, mpol_new);
    2552                 :          0 :                                 n->end = start;
    2553                 :          0 :                                 sp_insert(sp, n_new);
    2554                 :          0 :                                 n_new = NULL;
    2555                 :          0 :                                 mpol_new = NULL;
    2556                 :          0 :                                 break;
    2557                 :            :                         } else
    2558                 :          0 :                                 n->end = start;
    2559                 :            :                 }
    2560         [ #  # ]:          0 :                 if (!next)
    2561                 :            :                         break;
    2562                 :            :                 n = rb_entry(next, struct sp_node, nd);
    2563                 :            :         }
    2564         [ #  # ]:          0 :         if (new)
    2565                 :          0 :                 sp_insert(sp, new);
    2566                 :          0 :         write_unlock(&sp->lock);
    2567                 :          0 :         ret = 0;
    2568                 :            : 
    2569                 :          0 : err_out:
    2570         [ #  # ]:          0 :         if (mpol_new)
    2571                 :          0 :                 mpol_put(mpol_new);
    2572         [ #  # ]:          0 :         if (n_new)
    2573                 :          0 :                 kmem_cache_free(sn_cache, n_new);
    2574                 :            : 
    2575                 :          0 :         return ret;
    2576                 :            : 
    2577                 :            : alloc_new:
    2578                 :          0 :         write_unlock(&sp->lock);
    2579                 :          0 :         ret = -ENOMEM;
    2580                 :          0 :         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
    2581         [ #  # ]:          0 :         if (!n_new)
    2582                 :          0 :                 goto err_out;
    2583                 :          0 :         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    2584         [ #  # ]:          0 :         if (!mpol_new)
    2585                 :          0 :                 goto err_out;
    2586                 :          0 :         goto restart;
    2587                 :            : }
    2588                 :            : 
    2589                 :            : /**
    2590                 :            :  * mpol_shared_policy_init - initialize shared policy for inode
    2591                 :            :  * @sp: pointer to inode shared policy
    2592                 :            :  * @mpol:  struct mempolicy to install
    2593                 :            :  *
    2594                 :            :  * Install non-NULL @mpol in inode's shared policy rb-tree.
    2595                 :            :  * On entry, the current task has a reference on a non-NULL @mpol.
    2596                 :            :  * This must be released on exit.
    2597                 :            :  * This is called at get_inode() calls and we can use GFP_KERNEL.
    2598                 :            :  */
    2599                 :       1620 : void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
    2600                 :            : {
    2601                 :       1620 :         int ret;
    2602                 :            : 
    2603                 :       1620 :         sp->root = RB_ROOT;          /* empty tree == default mempolicy */
    2604                 :       1620 :         rwlock_init(&sp->lock);
    2605                 :            : 
    2606         [ -  + ]:       1620 :         if (mpol) {
    2607                 :          0 :                 struct vm_area_struct pvma;
    2608                 :          0 :                 struct mempolicy *new;
    2609                 :          0 :                 NODEMASK_SCRATCH(scratch);
    2610                 :            : 
    2611                 :          0 :                 if (!scratch)
    2612                 :            :                         goto put_mpol;
    2613                 :            :                 /* contextualize the tmpfs mount point mempolicy */
    2614                 :          0 :                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
    2615         [ #  # ]:          0 :                 if (IS_ERR(new))
    2616                 :          0 :                         goto free_scratch; /* no valid nodemask intersection */
    2617                 :            : 
    2618                 :          0 :                 task_lock(current);
    2619                 :          0 :                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
    2620                 :          0 :                 task_unlock(current);
    2621         [ #  # ]:          0 :                 if (ret)
    2622                 :          0 :                         goto put_new;
    2623                 :            : 
    2624                 :            :                 /* Create pseudo-vma that contains just the policy */
    2625                 :          0 :                 vma_init(&pvma, NULL);
    2626   [ #  #  #  # ]:          0 :                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
    2627                 :          0 :                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
    2628                 :            : 
    2629                 :          0 : put_new:
    2630         [ #  # ]:          0 :                 mpol_put(new);                  /* drop initial ref */
    2631                 :          0 : free_scratch:
    2632                 :          0 :                 NODEMASK_SCRATCH_FREE(scratch);
    2633                 :          0 : put_mpol:
    2634                 :          0 :                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
    2635                 :            :         }
    2636                 :       1620 : }
    2637                 :            : 
    2638                 :          0 : int mpol_set_shared_policy(struct shared_policy *info,
    2639                 :            :                         struct vm_area_struct *vma, struct mempolicy *npol)
    2640                 :            : {
    2641                 :          0 :         int err;
    2642                 :          0 :         struct sp_node *new = NULL;
    2643         [ #  # ]:          0 :         unsigned long sz = vma_pages(vma);
    2644                 :            : 
    2645                 :          0 :         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
    2646                 :            :                  vma->vm_pgoff,
    2647                 :            :                  sz, npol ? npol->mode : -1,
    2648                 :            :                  npol ? npol->flags : -1,
    2649                 :            :                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
    2650                 :            : 
    2651         [ #  # ]:          0 :         if (npol) {
    2652                 :          0 :                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
    2653         [ #  # ]:          0 :                 if (!new)
    2654                 :            :                         return -ENOMEM;
    2655                 :            :         }
    2656                 :          0 :         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
    2657         [ #  # ]:          0 :         if (err && new)
    2658                 :          0 :                 sp_free(new);
    2659                 :            :         return err;
    2660                 :            : }
    2661                 :            : 
    2662                 :            : /* Free a backing policy store on inode delete. */
    2663                 :        321 : void mpol_free_shared_policy(struct shared_policy *p)
    2664                 :            : {
    2665                 :        321 :         struct sp_node *n;
    2666                 :        321 :         struct rb_node *next;
    2667                 :            : 
    2668         [ -  + ]:        321 :         if (!p->root.rb_node)
    2669                 :            :                 return;
    2670                 :          0 :         write_lock(&p->lock);
    2671                 :          0 :         next = rb_first(&p->root);
    2672         [ #  # ]:          0 :         while (next) {
    2673                 :          0 :                 n = rb_entry(next, struct sp_node, nd);
    2674                 :          0 :                 next = rb_next(&n->nd);
    2675                 :          0 :                 sp_delete(p, n);
    2676                 :            :         }
    2677                 :          0 :         write_unlock(&p->lock);
    2678                 :            : }
    2679                 :            : 
    2680                 :            : #ifdef CONFIG_NUMA_BALANCING
    2681                 :            : static int __initdata numabalancing_override;
    2682                 :            : 
    2683                 :            : static void __init check_numabalancing_enable(void)
    2684                 :            : {
    2685                 :            :         bool numabalancing_default = false;
    2686                 :            : 
    2687                 :            :         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
    2688                 :            :                 numabalancing_default = true;
    2689                 :            : 
    2690                 :            :         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
    2691                 :            :         if (numabalancing_override)
    2692                 :            :                 set_numabalancing_state(numabalancing_override == 1);
    2693                 :            : 
    2694                 :            :         if (num_online_nodes() > 1 && !numabalancing_override) {
    2695                 :            :                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
    2696                 :            :                         numabalancing_default ? "Enabling" : "Disabling");
    2697                 :            :                 set_numabalancing_state(numabalancing_default);
    2698                 :            :         }
    2699                 :            : }
    2700                 :            : 
    2701                 :            : static int __init setup_numabalancing(char *str)
    2702                 :            : {
    2703                 :            :         int ret = 0;
    2704                 :            :         if (!str)
    2705                 :            :                 goto out;
    2706                 :            : 
    2707                 :            :         if (!strcmp(str, "enable")) {
    2708                 :            :                 numabalancing_override = 1;
    2709                 :            :                 ret = 1;
    2710                 :            :         } else if (!strcmp(str, "disable")) {
    2711                 :            :                 numabalancing_override = -1;
    2712                 :            :                 ret = 1;
    2713                 :            :         }
    2714                 :            : out:
    2715                 :            :         if (!ret)
    2716                 :            :                 pr_warn("Unable to parse numa_balancing=\n");
    2717                 :            : 
    2718                 :            :         return ret;
    2719                 :            : }
    2720                 :            : __setup("numa_balancing=", setup_numabalancing);
    2721                 :            : #else
    2722                 :          3 : static inline void __init check_numabalancing_enable(void)
    2723                 :            : {
    2724                 :          3 : }
    2725                 :            : #endif /* CONFIG_NUMA_BALANCING */
    2726                 :            : 
    2727                 :            : /* assumes fs == KERNEL_DS */
    2728                 :          3 : void __init numa_policy_init(void)
    2729                 :            : {
    2730                 :          3 :         nodemask_t interleave_nodes;
    2731                 :          3 :         unsigned long largest = 0;
    2732                 :          3 :         int nid, prefer = 0;
    2733                 :            : 
    2734                 :          3 :         policy_cache = kmem_cache_create("numa_policy",
    2735                 :            :                                          sizeof(struct mempolicy),
    2736                 :            :                                          0, SLAB_PANIC, NULL);
    2737                 :            : 
    2738                 :          3 :         sn_cache = kmem_cache_create("shared_policy_node",
    2739                 :            :                                      sizeof(struct sp_node),
    2740                 :            :                                      0, SLAB_PANIC, NULL);
    2741                 :            : 
    2742         [ +  + ]:          9 :         for_each_node(nid) {
    2743                 :          3 :                 preferred_node_policy[nid] = (struct mempolicy) {
    2744                 :            :                         .refcnt = ATOMIC_INIT(1),
    2745                 :            :                         .mode = MPOL_PREFERRED,
    2746                 :            :                         .flags = MPOL_F_MOF | MPOL_F_MORON,
    2747                 :            :                         .v = { .preferred_node = nid, },
    2748                 :            :                 };
    2749                 :            :         }
    2750                 :            : 
    2751                 :            :         /*
    2752                 :            :          * Set interleaving policy for system init. Interleaving is only
    2753                 :            :          * enabled across suitably sized nodes (default is >= 16MB), or
    2754                 :            :          * fall back to the largest node if they're all smaller.
    2755                 :            :          */
    2756                 :          3 :         nodes_clear(interleave_nodes);
    2757         [ +  + ]:         12 :         for_each_node_state(nid, N_MEMORY) {
    2758                 :          3 :                 unsigned long total_pages = node_present_pages(nid);
    2759                 :            : 
    2760                 :            :                 /* Preserve the largest node */
    2761         [ +  - ]:          3 :                 if (largest < total_pages) {
    2762                 :          3 :                         largest = total_pages;
    2763                 :          3 :                         prefer = nid;
    2764                 :            :                 }
    2765                 :            : 
    2766                 :            :                 /* Interleave this node? */
    2767         [ +  - ]:          3 :                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
    2768                 :          3 :                         node_set(nid, interleave_nodes);
    2769                 :            :         }
    2770                 :            : 
    2771                 :            :         /* All too small, use the largest */
    2772         [ -  + ]:          3 :         if (unlikely(nodes_empty(interleave_nodes)))
    2773                 :          0 :                 node_set(prefer, interleave_nodes);
    2774                 :            : 
    2775         [ -  + ]:          3 :         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
    2776                 :          0 :                 pr_err("%s: interleaving failed\n", __func__);
    2777                 :            : 
    2778                 :          3 :         check_numabalancing_enable();
    2779                 :          3 : }
    2780                 :            : 
    2781                 :            : /* Reset policy of current process to default */
    2782                 :          6 : void numa_default_policy(void)
    2783                 :            : {
    2784                 :          6 :         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
    2785                 :          6 : }
    2786                 :            : 
    2787                 :            : /*
    2788                 :            :  * Parse and format mempolicy from/to strings
    2789                 :            :  */
    2790                 :            : 
    2791                 :            : /*
    2792                 :            :  * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
    2793                 :            :  */
    2794                 :            : static const char * const policy_modes[] =
    2795                 :            : {
    2796                 :            :         [MPOL_DEFAULT]    = "default",
    2797                 :            :         [MPOL_PREFERRED]  = "prefer",
    2798                 :            :         [MPOL_BIND]       = "bind",
    2799                 :            :         [MPOL_INTERLEAVE] = "interleave",
    2800                 :            :         [MPOL_LOCAL]      = "local",
    2801                 :            : };
    2802                 :            : 
    2803                 :            : 
    2804                 :            : #ifdef CONFIG_TMPFS
    2805                 :            : /**
    2806                 :            :  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
    2807                 :            :  * @str:  string containing mempolicy to parse
    2808                 :            :  * @mpol:  pointer to struct mempolicy pointer, returned on success.
    2809                 :            :  *
    2810                 :            :  * Format of input:
    2811                 :            :  *      <mode>[=<flags>][:<nodelist>]
    2812                 :            :  *
    2813                 :            :  * On success, returns 0, else 1
    2814                 :            :  */
    2815                 :          0 : int mpol_parse_str(char *str, struct mempolicy **mpol)
    2816                 :            : {
    2817                 :          0 :         struct mempolicy *new = NULL;
    2818                 :          0 :         unsigned short mode_flags;
    2819                 :          0 :         nodemask_t nodes;
    2820                 :          0 :         char *nodelist = strchr(str, ':');
    2821                 :          0 :         char *flags = strchr(str, '=');
    2822                 :          0 :         int err = 1, mode;
    2823                 :            : 
    2824         [ #  # ]:          0 :         if (flags)
    2825                 :          0 :                 *flags++ = '\0';        /* terminate mode string */
    2826                 :            : 
    2827         [ #  # ]:          0 :         if (nodelist) {
    2828                 :            :                 /* NUL-terminate mode or flags string */
    2829                 :          0 :                 *nodelist++ = '\0';
    2830         [ #  # ]:          0 :                 if (nodelist_parse(nodelist, nodes))
    2831                 :          0 :                         goto out;
    2832         [ #  # ]:          0 :                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
    2833                 :          0 :                         goto out;
    2834                 :            :         } else
    2835                 :          0 :                 nodes_clear(nodes);
    2836                 :            : 
    2837                 :          0 :         mode = match_string(policy_modes, MPOL_MAX, str);
    2838         [ #  # ]:          0 :         if (mode < 0)
    2839                 :          0 :                 goto out;
    2840                 :            : 
    2841   [ #  #  #  #  :          0 :         switch (mode) {
                   #  # ]
    2842                 :          0 :         case MPOL_PREFERRED:
    2843                 :            :                 /*
    2844                 :            :                  * Insist on a nodelist of one node only
    2845                 :            :                  */
    2846         [ #  # ]:          0 :                 if (nodelist) {
    2847                 :            :                         char *rest = nodelist;
    2848         [ #  # ]:          0 :                         while (isdigit(*rest))
    2849                 :          0 :                                 rest++;
    2850         [ #  # ]:          0 :                         if (*rest)
    2851                 :          0 :                                 goto out;
    2852                 :            :                 }
    2853                 :            :                 break;
    2854                 :          0 :         case MPOL_INTERLEAVE:
    2855                 :            :                 /*
    2856                 :            :                  * Default to online nodes with memory if no nodelist
    2857                 :            :                  */
    2858         [ #  # ]:          0 :                 if (!nodelist)
    2859                 :          0 :                         nodes = node_states[N_MEMORY];
    2860                 :            :                 break;
    2861                 :          0 :         case MPOL_LOCAL:
    2862                 :            :                 /*
    2863                 :            :                  * Don't allow a nodelist;  mpol_new() checks flags
    2864                 :            :                  */
    2865         [ #  # ]:          0 :                 if (nodelist)
    2866                 :          0 :                         goto out;
    2867                 :            :                 mode = MPOL_PREFERRED;
    2868                 :            :                 break;
    2869                 :          0 :         case MPOL_DEFAULT:
    2870                 :            :                 /*
    2871                 :            :                  * Insist on an empty nodelist
    2872                 :            :                  */
    2873         [ #  # ]:          0 :                 if (!nodelist)
    2874                 :          0 :                         err = 0;
    2875                 :          0 :                 goto out;
    2876                 :          0 :         case MPOL_BIND:
    2877                 :            :                 /*
    2878                 :            :                  * Insist on a nodelist
    2879                 :            :                  */
    2880         [ #  # ]:          0 :                 if (!nodelist)
    2881                 :          0 :                         goto out;
    2882                 :            :         }
    2883                 :            : 
    2884                 :          0 :         mode_flags = 0;
    2885         [ #  # ]:          0 :         if (flags) {
    2886                 :            :                 /*
    2887                 :            :                  * Currently, we only support two mutually exclusive
    2888                 :            :                  * mode flags.
    2889                 :            :                  */
    2890         [ #  # ]:          0 :                 if (!strcmp(flags, "static"))
    2891                 :            :                         mode_flags |= MPOL_F_STATIC_NODES;
    2892         [ #  # ]:          0 :                 else if (!strcmp(flags, "relative"))
    2893                 :            :                         mode_flags |= MPOL_F_RELATIVE_NODES;
    2894                 :            :                 else
    2895                 :          0 :                         goto out;
    2896                 :            :         }
    2897                 :            : 
    2898                 :          0 :         new = mpol_new(mode, mode_flags, &nodes);
    2899         [ #  # ]:          0 :         if (IS_ERR(new))
    2900                 :          0 :                 goto out;
    2901                 :            : 
    2902                 :            :         /*
    2903                 :            :          * Save nodes for mpol_to_str() to show the tmpfs mount options
    2904                 :            :          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
    2905                 :            :          */
    2906         [ #  # ]:          0 :         if (mode != MPOL_PREFERRED)
    2907                 :          0 :                 new->v.nodes = nodes;
    2908         [ #  # ]:          0 :         else if (nodelist)
    2909                 :          0 :                 new->v.preferred_node = first_node(nodes);
    2910                 :            :         else
    2911                 :          0 :                 new->flags |= MPOL_F_LOCAL;
    2912                 :            : 
    2913                 :            :         /*
    2914                 :            :          * Save nodes for contextualization: this will be used to "clone"
    2915                 :            :          * the mempolicy in a specific context [cpuset] at a later time.
    2916                 :            :          */
    2917                 :          0 :         new->w.user_nodemask = nodes;
    2918                 :            : 
    2919                 :          0 :         err = 0;
    2920                 :            : 
    2921                 :          0 : out:
    2922                 :            :         /* Restore string for error message */
    2923         [ #  # ]:          0 :         if (nodelist)
    2924                 :          0 :                 *--nodelist = ':';
    2925         [ #  # ]:          0 :         if (flags)
    2926                 :          0 :                 *--flags = '=';
    2927         [ #  # ]:          0 :         if (!err)
    2928                 :          0 :                 *mpol = new;
    2929                 :          0 :         return err;
    2930                 :            : }
    2931                 :            : #endif /* CONFIG_TMPFS */
    2932                 :            : 
    2933                 :            : /**
    2934                 :            :  * mpol_to_str - format a mempolicy structure for printing
    2935                 :            :  * @buffer:  to contain formatted mempolicy string
    2936                 :            :  * @maxlen:  length of @buffer
    2937                 :            :  * @pol:  pointer to mempolicy to be formatted
    2938                 :            :  *
    2939                 :            :  * Convert @pol into a string.  If @buffer is too short, truncate the string.
    2940                 :            :  * A @maxlen of at least 32 is recommended: enough for the longest mode,
    2941                 :            :  * "interleave", the longest flag, "relative", and at least a few node ids.
    2942                 :            :  */
    2943                 :          0 : void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
    2944                 :            : {
    2945                 :          0 :         char *p = buffer;
    2946                 :          0 :         nodemask_t nodes = NODE_MASK_NONE;
    2947                 :          0 :         unsigned short mode = MPOL_DEFAULT;
    2948                 :          0 :         unsigned short flags = 0;
    2949                 :            : 
    2950   [ #  #  #  #  :          0 :         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
                   #  # ]
    2951                 :          0 :                 mode = pol->mode;
    2952                 :          0 :                 flags = pol->flags;
    2953                 :            :         }
    2954                 :            : 
    2955   [ #  #  #  # ]:          0 :         switch (mode) {
    2956                 :            :         case MPOL_DEFAULT:
    2957                 :            :                 break;
    2958                 :          0 :         case MPOL_PREFERRED:
    2959         [ #  # ]:          0 :                 if (flags & MPOL_F_LOCAL)
    2960                 :            :                         mode = MPOL_LOCAL;
    2961                 :            :                 else
    2962                 :          0 :                         node_set(pol->v.preferred_node, nodes);
    2963                 :            :                 break;
    2964                 :          0 :         case MPOL_BIND:
    2965                 :            :         case MPOL_INTERLEAVE:
    2966                 :          0 :                 nodes = pol->v.nodes;
    2967                 :          0 :                 break;
    2968                 :            :         default:
    2969                 :          0 :                 WARN_ON_ONCE(1);
    2970                 :          0 :                 snprintf(p, maxlen, "unknown");
    2971                 :          0 :                 return;
    2972                 :            :         }
    2973                 :            : 
    2974                 :          0 :         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
    2975                 :            : 
    2976         [ #  # ]:          0 :         if (flags & MPOL_MODE_FLAGS) {
    2977                 :          0 :                 p += snprintf(p, buffer + maxlen - p, "=");
    2978                 :            : 
    2979                 :            :                 /*
    2980                 :            :                  * Currently, the only defined flags are mutually exclusive
    2981                 :            :                  */
    2982         [ #  # ]:          0 :                 if (flags & MPOL_F_STATIC_NODES)
    2983                 :          0 :                         p += snprintf(p, buffer + maxlen - p, "static");
    2984                 :          0 :                 else if (flags & MPOL_F_RELATIVE_NODES)
    2985                 :          0 :                         p += snprintf(p, buffer + maxlen - p, "relative");
    2986                 :            :         }
    2987                 :            : 
    2988         [ #  # ]:          0 :         if (!nodes_empty(nodes))
    2989                 :          0 :                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
    2990                 :            :                                nodemask_pr_args(&nodes));
    2991                 :            : }

Generated by: LCOV version 1.14