LCOV - combined.info - mm/hugetlb.c

LCOV - code coverage report

Current view:	top level - mm - hugetlb.c (source / functions)		Hit	Total	Coverage
Test:	combined.info	Lines:	133	2023	6.6 %
Date:	2022-04-01 13:59:58	Functions:	15	126	11.9 %
		Branches:	51	1228	4.2 %

           Branch data     Line data    Source code

       1                 :            : // SPDX-License-Identifier: GPL-2.0-only
       2                 :            : /*
       3                 :            :  * Generic hugetlb support.
       4                 :            :  * (C) Nadia Yvette Chambers, April 2004
       5                 :            :  */
       6                 :            : #include <linux/list.h>
       7                 :            : #include <linux/init.h>
       8                 :            : #include <linux/mm.h>
       9                 :            : #include <linux/seq_file.h>
      10                 :            : #include <linux/sysctl.h>
      11                 :            : #include <linux/highmem.h>
      12                 :            : #include <linux/mmu_notifier.h>
      13                 :            : #include <linux/nodemask.h>
      14                 :            : #include <linux/pagemap.h>
      15                 :            : #include <linux/mempolicy.h>
      16                 :            : #include <linux/compiler.h>
      17                 :            : #include <linux/cpuset.h>
      18                 :            : #include <linux/mutex.h>
      19                 :            : #include <linux/memblock.h>
      20                 :            : #include <linux/sysfs.h>
      21                 :            : #include <linux/slab.h>
      22                 :            : #include <linux/mmdebug.h>
      23                 :            : #include <linux/sched/signal.h>
      24                 :            : #include <linux/rmap.h>
      25                 :            : #include <linux/string_helpers.h>
      26                 :            : #include <linux/swap.h>
      27                 :            : #include <linux/swapops.h>
      28                 :            : #include <linux/jhash.h>
      29                 :            : #include <linux/numa.h>
      30                 :            : #include <linux/llist.h>
      31                 :            : 
      32                 :            : #include <asm/page.h>
      33                 :            : #include <asm/pgtable.h>
      34                 :            : #include <asm/tlb.h>
      35                 :            : 
      36                 :            : #include <linux/io.h>
      37                 :            : #include <linux/hugetlb.h>
      38                 :            : #include <linux/hugetlb_cgroup.h>
      39                 :            : #include <linux/node.h>
      40                 :            : #include <linux/userfaultfd_k.h>
      41                 :            : #include <linux/page_owner.h>
      42                 :            : #include "internal.h"
      43                 :            : 
      44                 :            : int hugetlb_max_hstate __read_mostly;
      45                 :            : unsigned int default_hstate_idx;
      46                 :            : struct hstate hstates[HUGE_MAX_HSTATE];
      47                 :            : /*
      48                 :            :  * Minimum page order among possible hugepage sizes, set to a proper value
      49                 :            :  * at boot time.
      50                 :            :  */
      51                 :            : static unsigned int minimum_order __read_mostly = UINT_MAX;
      52                 :            : 
      53                 :            : __initdata LIST_HEAD(huge_boot_pages);
      54                 :            : 
      55                 :            : /* for command line parsing */
      56                 :            : static struct hstate * __initdata parsed_hstate;
      57                 :            : static unsigned long __initdata default_hstate_max_huge_pages;
      58                 :            : static unsigned long __initdata default_hstate_size;
      59                 :            : static bool __initdata parsed_valid_hugepagesz = true;
      60                 :            : 
      61                 :            : /*
      62                 :            :  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
      63                 :            :  * free_huge_pages, and surplus_huge_pages.
      64                 :            :  */
      65                 :            : DEFINE_SPINLOCK(hugetlb_lock);
      66                 :            : 
      67                 :            : /*
      68                 :            :  * Serializes faults on the same logical page.  This is used to
      69                 :            :  * prevent spurious OOMs when the hugepage pool is fully utilized.
      70                 :            :  */
      71                 :            : static int num_fault_mutexes;
      72                 :            : struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
      73                 :            : 
      74                 :            : /* Forward declaration */
      75                 :            : static int hugetlb_acct_memory(struct hstate *h, long delta);
      76                 :            : 
      77                 :          0 : static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
      78                 :            : {
      79   [ #  #  #  # ]:          0 :         bool free = (spool->count == 0) && (spool->used_hpages == 0);
      80                 :            : 
      81                 :          0 :         spin_unlock(&spool->lock);
      82                 :            : 
      83                 :            :         /* If no pages are used, and no other handles to the subpool
      84                 :            :          * remain, give up any reservations based on minimum size and
      85                 :            :          * free the subpool */
      86         [ #  # ]:          0 :         if (free) {
      87         [ #  # ]:          0 :                 if (spool->min_hpages != -1)
      88                 :          0 :                         hugetlb_acct_memory(spool->hstate,
      89                 :            :                                                 -spool->min_hpages);
      90                 :          0 :                 kfree(spool);
      91                 :            :         }
      92                 :          0 : }
      93                 :            : 
      94                 :          0 : struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
      95                 :            :                                                 long min_hpages)
      96                 :            : {
      97                 :          0 :         struct hugepage_subpool *spool;
      98                 :            : 
      99                 :          0 :         spool = kzalloc(sizeof(*spool), GFP_KERNEL);
     100         [ #  # ]:          0 :         if (!spool)
     101                 :            :                 return NULL;
     102                 :            : 
     103         [ #  # ]:          0 :         spin_lock_init(&spool->lock);
     104                 :          0 :         spool->count = 1;
     105                 :          0 :         spool->max_hpages = max_hpages;
     106                 :          0 :         spool->hstate = h;
     107                 :          0 :         spool->min_hpages = min_hpages;
     108                 :            : 
     109   [ #  #  #  # ]:          0 :         if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
     110                 :          0 :                 kfree(spool);
     111                 :          0 :                 return NULL;
     112                 :            :         }
     113                 :          0 :         spool->rsv_hpages = min_hpages;
     114                 :            : 
     115                 :          0 :         return spool;
     116                 :            : }
     117                 :            : 
     118                 :          0 : void hugepage_put_subpool(struct hugepage_subpool *spool)
     119                 :            : {
     120                 :          0 :         spin_lock(&spool->lock);
     121         [ #  # ]:          0 :         BUG_ON(!spool->count);
     122                 :          0 :         spool->count--;
     123                 :          0 :         unlock_or_release_subpool(spool);
     124                 :          0 : }
     125                 :            : 
     126                 :            : /*
     127                 :            :  * Subpool accounting for allocating and reserving pages.
     128                 :            :  * Return -ENOMEM if there are not enough resources to satisfy the
     129                 :            :  * request.  Otherwise, return the number of pages by which the
     130                 :            :  * global pools must be adjusted (upward).  The returned value may
     131                 :            :  * only be different than the passed value (delta) in the case where
     132                 :            :  * a subpool minimum size must be maintained.
     133                 :            :  */
     134                 :          0 : static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
     135                 :            :                                       long delta)
     136                 :            : {
     137                 :          0 :         long ret = delta;
     138                 :            : 
     139         [ #  # ]:          0 :         if (!spool)
     140                 :            :                 return ret;
     141                 :            : 
     142                 :          0 :         spin_lock(&spool->lock);
     143                 :            : 
     144         [ #  # ]:          0 :         if (spool->max_hpages != -1) {               /* maximum size accounting */
     145         [ #  # ]:          0 :                 if ((spool->used_hpages + delta) <= spool->max_hpages)
     146                 :          0 :                         spool->used_hpages += delta;
     147                 :            :                 else {
     148                 :          0 :                         ret = -ENOMEM;
     149                 :          0 :                         goto unlock_ret;
     150                 :            :                 }
     151                 :            :         }
     152                 :            : 
     153                 :            :         /* minimum size accounting */
     154   [ #  #  #  # ]:          0 :         if (spool->min_hpages != -1 && spool->rsv_hpages) {
     155         [ #  # ]:          0 :                 if (delta > spool->rsv_hpages) {
     156                 :            :                         /*
     157                 :            :                          * Asking for more reserves than those already taken on
     158                 :            :                          * behalf of subpool.  Return difference.
     159                 :            :                          */
     160                 :          0 :                         ret = delta - spool->rsv_hpages;
     161                 :          0 :                         spool->rsv_hpages = 0;
     162                 :            :                 } else {
     163                 :          0 :                         ret = 0;        /* reserves already accounted for */
     164                 :          0 :                         spool->rsv_hpages -= delta;
     165                 :            :                 }
     166                 :            :         }
     167                 :            : 
     168                 :          0 : unlock_ret:
     169                 :          0 :         spin_unlock(&spool->lock);
     170                 :          0 :         return ret;
     171                 :            : }
     172                 :            : 
     173                 :            : /*
     174                 :            :  * Subpool accounting for freeing and unreserving pages.
     175                 :            :  * Return the number of global page reservations that must be dropped.
     176                 :            :  * The return value may only be different than the passed value (delta)
     177                 :            :  * in the case where a subpool minimum size must be maintained.
     178                 :            :  */
     179                 :          0 : static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
     180                 :            :                                        long delta)
     181                 :            : {
     182                 :          0 :         long ret = delta;
     183                 :            : 
     184         [ #  # ]:          0 :         if (!spool)
     185                 :            :                 return delta;
     186                 :            : 
     187                 :          0 :         spin_lock(&spool->lock);
     188                 :            : 
     189         [ #  # ]:          0 :         if (spool->max_hpages != -1)         /* maximum size accounting */
     190                 :          0 :                 spool->used_hpages -= delta;
     191                 :            : 
     192                 :            :          /* minimum size accounting */
     193   [ #  #  #  # ]:          0 :         if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
     194         [ #  # ]:          0 :                 if (spool->rsv_hpages + delta <= spool->min_hpages)
     195                 :            :                         ret = 0;
     196                 :            :                 else
     197                 :          0 :                         ret = spool->rsv_hpages + delta - spool->min_hpages;
     198                 :            : 
     199                 :          0 :                 spool->rsv_hpages += delta;
     200         [ #  # ]:          0 :                 if (spool->rsv_hpages > spool->min_hpages)
     201                 :          0 :                         spool->rsv_hpages = spool->min_hpages;
     202                 :            :         }
     203                 :            : 
     204                 :            :         /*
     205                 :            :          * If hugetlbfs_put_super couldn't free spool due to an outstanding
     206                 :            :          * quota reference, free it now.
     207                 :            :          */
     208                 :          0 :         unlock_or_release_subpool(spool);
     209                 :            : 
     210                 :          0 :         return ret;
     211                 :            : }
     212                 :            : 
     213                 :          0 : static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
     214                 :            : {
     215         [ #  # ]:          0 :         return HUGETLBFS_SB(inode->i_sb)->spool;
     216                 :            : }
     217                 :            : 
     218                 :          0 : static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
     219                 :            : {
     220                 :          0 :         return subpool_inode(file_inode(vma->vm_file));
     221                 :            : }
     222                 :            : 
     223                 :            : /*
     224                 :            :  * Region tracking -- allows tracking of reservations and instantiated pages
     225                 :            :  *                    across the pages in a mapping.
     226                 :            :  *
     227                 :            :  * The region data structures are embedded into a resv_map and protected
     228                 :            :  * by a resv_map's lock.  The set of regions within the resv_map represent
     229                 :            :  * reservations for huge pages, or huge pages that have already been
     230                 :            :  * instantiated within the map.  The from and to elements are huge page
     231                 :            :  * indices into the associated mapping.  from indicates the starting index
     232                 :            :  * of the region.  to represents the first index past the end of the region.
     233                 :            :  *
     234                 :            :  * For example, a file region structure with from == 0 and to == 4 represents
     235                 :            :  * four huge pages in a mapping.  It is important to note that the to element
     236                 :            :  * represents the first element past the end of the region. This is used in
     237                 :            :  * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
     238                 :            :  *
     239                 :            :  * Interval notation of the form [from, to) will be used to indicate that
     240                 :            :  * the endpoint from is inclusive and to is exclusive.
     241                 :            :  */
     242                 :            : struct file_region {
     243                 :            :         struct list_head link;
     244                 :            :         long from;
     245                 :            :         long to;
     246                 :            : };
     247                 :            : 
     248                 :            : /* Must be called with resv->lock held. Calling this with count_only == true
     249                 :            :  * will count the number of pages to be added but will not modify the linked
     250                 :            :  * list.
     251                 :            :  */
     252                 :          0 : static long add_reservation_in_range(struct resv_map *resv, long f, long t,
     253                 :            :                                      bool count_only)
     254                 :            : {
     255                 :          0 :         long chg = 0;
     256                 :          0 :         struct list_head *head = &resv->regions;
     257                 :          0 :         struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
     258                 :            : 
     259                 :            :         /* Locate the region we are before or in. */
     260         [ #  # ]:          0 :         list_for_each_entry(rg, head, link)
     261         [ #  # ]:          0 :                 if (f <= rg->to)
     262                 :            :                         break;
     263                 :            : 
     264                 :            :         /* Round our left edge to the current segment if it encloses us. */
     265                 :          0 :         if (f > rg->from)
     266                 :            :                 f = rg->from;
     267                 :            : 
     268                 :          0 :         chg = t - f;
     269                 :            : 
     270                 :            :         /* Check for and consume any regions we now overlap with. */
     271                 :          0 :         nrg = rg;
     272         [ #  # ]:          0 :         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
     273         [ #  # ]:          0 :                 if (&rg->link == head)
     274                 :            :                         break;
     275         [ #  # ]:          0 :                 if (rg->from > t)
     276                 :            :                         break;
     277                 :            : 
     278                 :            :                 /* We overlap with this area, if it extends further than
     279                 :            :                  * us then we must extend ourselves.  Account for its
     280                 :            :                  * existing reservation.
     281                 :            :                  */
     282         [ #  # ]:          0 :                 if (rg->to > t) {
     283                 :          0 :                         chg += rg->to - t;
     284                 :          0 :                         t = rg->to;
     285                 :            :                 }
     286                 :          0 :                 chg -= rg->to - rg->from;
     287                 :            : 
     288         [ #  # ]:          0 :                 if (!count_only && rg != nrg) {
     289                 :          0 :                         list_del(&rg->link);
     290                 :          0 :                         kfree(rg);
     291                 :            :                 }
     292                 :            :         }
     293                 :            : 
     294         [ #  # ]:          0 :         if (!count_only) {
     295                 :          0 :                 nrg->from = f;
     296                 :          0 :                 nrg->to = t;
     297                 :            :         }
     298                 :            : 
     299                 :          0 :         return chg;
     300                 :            : }
     301                 :            : 
     302                 :            : /*
     303                 :            :  * Add the huge page range represented by [f, t) to the reserve
     304                 :            :  * map.  Existing regions will be expanded to accommodate the specified
     305                 :            :  * range, or a region will be taken from the cache.  Sufficient regions
     306                 :            :  * must exist in the cache due to the previous call to region_chg with
     307                 :            :  * the same range.
     308                 :            :  *
     309                 :            :  * Return the number of new huge pages added to the map.  This
     310                 :            :  * number is greater than or equal to zero.
     311                 :            :  */
     312                 :          0 : static long region_add(struct resv_map *resv, long f, long t)
     313                 :            : {
     314                 :          0 :         struct list_head *head = &resv->regions;
     315                 :          0 :         struct file_region *rg, *nrg;
     316                 :          0 :         long add = 0;
     317                 :            : 
     318                 :          0 :         spin_lock(&resv->lock);
     319                 :            :         /* Locate the region we are either in or before. */
     320         [ #  # ]:          0 :         list_for_each_entry(rg, head, link)
     321         [ #  # ]:          0 :                 if (f <= rg->to)
     322                 :            :                         break;
     323                 :            : 
     324                 :            :         /*
     325                 :            :          * If no region exists which can be expanded to include the
     326                 :            :          * specified range, pull a region descriptor from the cache
     327                 :            :          * and use it for this range.
     328                 :            :          */
     329   [ #  #  #  # ]:          0 :         if (&rg->link == head || t < rg->from) {
     330                 :          0 :                 VM_BUG_ON(resv->region_cache_count <= 0);
     331                 :            : 
     332                 :          0 :                 resv->region_cache_count--;
     333                 :          0 :                 nrg = list_first_entry(&resv->region_cache, struct file_region,
     334                 :            :                                         link);
     335                 :          0 :                 list_del(&nrg->link);
     336                 :            : 
     337                 :          0 :                 nrg->from = f;
     338                 :          0 :                 nrg->to = t;
     339                 :          0 :                 list_add(&nrg->link, rg->link.prev);
     340                 :            : 
     341                 :          0 :                 add += t - f;
     342                 :          0 :                 goto out_locked;
     343                 :            :         }
     344                 :            : 
     345                 :          0 :         add = add_reservation_in_range(resv, f, t, false);
     346                 :            : 
     347                 :          0 : out_locked:
     348                 :          0 :         resv->adds_in_progress--;
     349                 :          0 :         spin_unlock(&resv->lock);
     350                 :          0 :         VM_BUG_ON(add < 0);
     351                 :          0 :         return add;
     352                 :            : }
     353                 :            : 
     354                 :            : /*
     355                 :            :  * Examine the existing reserve map and determine how many
     356                 :            :  * huge pages in the specified range [f, t) are NOT currently
     357                 :            :  * represented.  This routine is called before a subsequent
     358                 :            :  * call to region_add that will actually modify the reserve
     359                 :            :  * map to add the specified range [f, t).  region_chg does
     360                 :            :  * not change the number of huge pages represented by the
     361                 :            :  * map.  A new file_region structure is added to the cache
     362                 :            :  * as a placeholder, so that the subsequent region_add
     363                 :            :  * call will have all the regions it needs and will not fail.
     364                 :            :  *
     365                 :            :  * Returns the number of huge pages that need to be added to the existing
     366                 :            :  * reservation map for the range [f, t).  This number is greater than or
     367                 :            :  * equal to zero.  -ENOMEM is returned if a new file_region structure or
     368                 :            :  * cache entry is needed and cannot be allocated.
     369                 :            :  */
     370                 :          0 : static long region_chg(struct resv_map *resv, long f, long t)
     371                 :            : {
     372                 :          0 :         long chg = 0;
     373                 :            : 
     374                 :          0 :         spin_lock(&resv->lock);
     375                 :          0 : retry_locked:
     376                 :          0 :         resv->adds_in_progress++;
     377                 :            : 
     378                 :            :         /*
     379                 :            :          * Check for sufficient descriptors in the cache to accommodate
     380                 :            :          * the number of in progress add operations.
     381                 :            :          */
     382         [ #  # ]:          0 :         if (resv->adds_in_progress > resv->region_cache_count) {
     383                 :          0 :                 struct file_region *trg;
     384                 :            : 
     385                 :          0 :                 VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
     386                 :            :                 /* Must drop lock to allocate a new descriptor. */
     387                 :          0 :                 resv->adds_in_progress--;
     388                 :          0 :                 spin_unlock(&resv->lock);
     389                 :            : 
     390                 :          0 :                 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
     391         [ #  # ]:          0 :                 if (!trg)
     392                 :            :                         return -ENOMEM;
     393                 :            : 
     394                 :          0 :                 spin_lock(&resv->lock);
     395                 :          0 :                 list_add(&trg->link, &resv->region_cache);
     396                 :          0 :                 resv->region_cache_count++;
     397                 :          0 :                 goto retry_locked;
     398                 :            :         }
     399                 :            : 
     400                 :          0 :         chg = add_reservation_in_range(resv, f, t, true);
     401                 :            : 
     402                 :          0 :         spin_unlock(&resv->lock);
     403                 :          0 :         return chg;
     404                 :            : }
     405                 :            : 
     406                 :            : /*
     407                 :            :  * Abort the in progress add operation.  The adds_in_progress field
     408                 :            :  * of the resv_map keeps track of the operations in progress between
     409                 :            :  * calls to region_chg and region_add.  Operations are sometimes
     410                 :            :  * aborted after the call to region_chg.  In such cases, region_abort
     411                 :            :  * is called to decrement the adds_in_progress counter.
     412                 :            :  *
     413                 :            :  * NOTE: The range arguments [f, t) are not needed or used in this
     414                 :            :  * routine.  They are kept to make reading the calling code easier as
     415                 :            :  * arguments will match the associated region_chg call.
     416                 :            :  */
     417                 :          0 : static void region_abort(struct resv_map *resv, long f, long t)
     418                 :            : {
     419                 :          0 :         spin_lock(&resv->lock);
     420                 :          0 :         VM_BUG_ON(!resv->region_cache_count);
     421                 :          0 :         resv->adds_in_progress--;
     422                 :          0 :         spin_unlock(&resv->lock);
     423                 :          0 : }
     424                 :            : 
     425                 :            : /*
     426                 :            :  * Delete the specified range [f, t) from the reserve map.  If the
     427                 :            :  * t parameter is LONG_MAX, this indicates that ALL regions after f
     428                 :            :  * should be deleted.  Locate the regions which intersect [f, t)
     429                 :            :  * and either trim, delete or split the existing regions.
     430                 :            :  *
     431                 :            :  * Returns the number of huge pages deleted from the reserve map.
     432                 :            :  * In the normal case, the return value is zero or more.  In the
     433                 :            :  * case where a region must be split, a new region descriptor must
     434                 :            :  * be allocated.  If the allocation fails, -ENOMEM will be returned.
     435                 :            :  * NOTE: If the parameter t == LONG_MAX, then we will never split
     436                 :            :  * a region and possibly return -ENOMEM.  Callers specifying
     437                 :            :  * t == LONG_MAX do not need to check for -ENOMEM error.
     438                 :            :  */
     439                 :          0 : static long region_del(struct resv_map *resv, long f, long t)
     440                 :            : {
     441                 :          0 :         struct list_head *head = &resv->regions;
     442                 :          0 :         struct file_region *rg, *trg;
     443                 :          0 :         struct file_region *nrg = NULL;
     444                 :          0 :         long del = 0;
     445                 :            : 
     446                 :          0 : retry:
     447                 :          0 :         spin_lock(&resv->lock);
     448         [ #  # ]:          0 :         list_for_each_entry_safe(rg, trg, head, link) {
     449                 :            :                 /*
     450                 :            :                  * Skip regions before the range to be deleted.  file_region
     451                 :            :                  * ranges are normally of the form [from, to).  However, there
     452                 :            :                  * may be a "placeholder" entry in the map which is of the form
     453                 :            :                  * (from, to) with from == to.  Check for placeholder entries
     454                 :            :                  * at the beginning of the range to be deleted.
     455                 :            :                  */
     456   [ #  #  #  #  :          0 :                 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
                   #  # ]
     457                 :          0 :                         continue;
     458                 :            : 
     459         [ #  # ]:          0 :                 if (rg->from >= t)
     460                 :            :                         break;
     461                 :            : 
     462   [ #  #  #  # ]:          0 :                 if (f > rg->from && t < rg->to) { /* Must split region */
     463                 :            :                         /*
     464                 :            :                          * Check for an entry in the cache before dropping
     465                 :            :                          * lock and attempting allocation.
     466                 :            :                          */
     467         [ #  # ]:          0 :                         if (!nrg &&
     468         [ #  # ]:          0 :                             resv->region_cache_count > resv->adds_in_progress) {
     469                 :          0 :                                 nrg = list_first_entry(&resv->region_cache,
     470                 :            :                                                         struct file_region,
     471                 :            :                                                         link);
     472                 :          0 :                                 list_del(&nrg->link);
     473                 :          0 :                                 resv->region_cache_count--;
     474                 :            :                         }
     475                 :            : 
     476         [ #  # ]:          0 :                         if (!nrg) {
     477                 :          0 :                                 spin_unlock(&resv->lock);
     478                 :          0 :                                 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
     479         [ #  # ]:          0 :                                 if (!nrg)
     480                 :            :                                         return -ENOMEM;
     481                 :          0 :                                 goto retry;
     482                 :            :                         }
     483                 :            : 
     484                 :          0 :                         del += t - f;
     485                 :            : 
     486                 :            :                         /* New entry for end of split region */
     487                 :          0 :                         nrg->from = t;
     488                 :          0 :                         nrg->to = rg->to;
     489                 :          0 :                         INIT_LIST_HEAD(&nrg->link);
     490                 :            : 
     491                 :            :                         /* Original entry is trimmed */
     492                 :          0 :                         rg->to = f;
     493                 :            : 
     494                 :          0 :                         list_add(&nrg->link, &rg->link);
     495                 :          0 :                         nrg = NULL;
     496                 :          0 :                         break;
     497                 :            :                 }
     498                 :            : 
     499   [ #  #  #  # ]:          0 :                 if (f <= rg->from && t >= rg->to) { /* Remove entire region */
     500                 :          0 :                         del += rg->to - rg->from;
     501                 :          0 :                         list_del(&rg->link);
     502                 :          0 :                         kfree(rg);
     503                 :          0 :                         continue;
     504                 :            :                 }
     505                 :            : 
     506         [ #  # ]:          0 :                 if (f <= rg->from) {      /* Trim beginning of region */
     507                 :          0 :                         del += t - rg->from;
     508                 :          0 :                         rg->from = t;
     509                 :            :                 } else {                /* Trim end of region */
     510                 :          0 :                         del += rg->to - f;
     511                 :          0 :                         rg->to = f;
     512                 :            :                 }
     513                 :            :         }
     514                 :            : 
     515                 :          0 :         spin_unlock(&resv->lock);
     516                 :          0 :         kfree(nrg);
     517                 :          0 :         return del;
     518                 :            : }
     519                 :            : 
     520                 :            : /*
     521                 :            :  * A rare out of memory error was encountered which prevented removal of
     522                 :            :  * the reserve map region for a page.  The huge page itself was free'ed
     523                 :            :  * and removed from the page cache.  This routine will adjust the subpool
     524                 :            :  * usage count, and the global reserve count if needed.  By incrementing
     525                 :            :  * these counts, the reserve map entry which could not be deleted will
     526                 :            :  * appear as a "reserved" entry instead of simply dangling with incorrect
     527                 :            :  * counts.
     528                 :            :  */
     529                 :          0 : void hugetlb_fix_reserve_counts(struct inode *inode)
     530                 :            : {
     531                 :          0 :         struct hugepage_subpool *spool = subpool_inode(inode);
     532                 :          0 :         long rsv_adjust;
     533                 :            : 
     534                 :          0 :         rsv_adjust = hugepage_subpool_get_pages(spool, 1);
     535         [ #  # ]:          0 :         if (rsv_adjust) {
     536                 :          0 :                 struct hstate *h = hstate_inode(inode);
     537                 :            : 
     538                 :          0 :                 hugetlb_acct_memory(h, 1);
     539                 :            :         }
     540                 :          0 : }
     541                 :            : 
     542                 :            : /*
     543                 :            :  * Count and return the number of huge pages in the reserve map
     544                 :            :  * that intersect with the range [f, t).
     545                 :            :  */
     546                 :          0 : static long region_count(struct resv_map *resv, long f, long t)
     547                 :            : {
     548                 :          0 :         struct list_head *head = &resv->regions;
     549                 :          0 :         struct file_region *rg;
     550                 :          0 :         long chg = 0;
     551                 :            : 
     552                 :          0 :         spin_lock(&resv->lock);
     553                 :            :         /* Locate each segment we overlap with, and count that overlap. */
     554         [ #  # ]:          0 :         list_for_each_entry(rg, head, link) {
     555                 :          0 :                 long seg_from;
     556                 :          0 :                 long seg_to;
     557                 :            : 
     558         [ #  # ]:          0 :                 if (rg->to <= f)
     559                 :          0 :                         continue;
     560         [ #  # ]:          0 :                 if (rg->from >= t)
     561                 :            :                         break;
     562                 :            : 
     563                 :          0 :                 seg_from = max(rg->from, f);
     564                 :          0 :                 seg_to = min(rg->to, t);
     565                 :            : 
     566                 :          0 :                 chg += seg_to - seg_from;
     567                 :            :         }
     568                 :          0 :         spin_unlock(&resv->lock);
     569                 :            : 
     570                 :          0 :         return chg;
     571                 :            : }
     572                 :            : 
     573                 :            : /*
     574                 :            :  * Convert the address within this vma to the page offset within
     575                 :            :  * the mapping, in pagecache page units; huge pages here.
     576                 :            :  */
     577                 :          0 : static pgoff_t vma_hugecache_offset(struct hstate *h,
     578                 :            :                         struct vm_area_struct *vma, unsigned long address)
     579                 :            : {
     580                 :          0 :         return ((address - vma->vm_start) >> huge_page_shift(h)) +
     581                 :          0 :                         (vma->vm_pgoff >> huge_page_order(h));
     582                 :            : }
     583                 :            : 
     584                 :          0 : pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
     585                 :            :                                      unsigned long address)
     586                 :            : {
     587                 :          0 :         return vma_hugecache_offset(hstate_vma(vma), vma, address);
     588                 :            : }
     589                 :            : EXPORT_SYMBOL_GPL(linear_hugepage_index);
     590                 :            : 
     591                 :            : /*
     592                 :            :  * Return the size of the pages allocated when backing a VMA. In the majority
     593                 :            :  * cases this will be same size as used by the page table entries.
     594                 :            :  */
     595                 :          0 : unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
     596                 :            : {
     597   [ #  #  #  #  :          0 :         if (vma->vm_ops && vma->vm_ops->pagesize)
                   #  # ]
     598                 :          0 :                 return vma->vm_ops->pagesize(vma);
     599                 :            :         return PAGE_SIZE;
     600                 :            : }
     601                 :            : EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
     602                 :            : 
     603                 :            : /*
     604                 :            :  * Return the page size being used by the MMU to back a VMA. In the majority
     605                 :            :  * of cases, the page size used by the kernel matches the MMU size. On
     606                 :            :  * architectures where it differs, an architecture-specific 'strong'
     607                 :            :  * version of this symbol is required.
     608                 :            :  */
     609                 :          0 : __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
     610                 :            : {
     611         [ #  # ]:          0 :         return vma_kernel_pagesize(vma);
     612                 :            : }
     613                 :            : 
     614                 :            : /*
     615                 :            :  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
     616                 :            :  * bits of the reservation map pointer, which are always clear due to
     617                 :            :  * alignment.
                         :            :  *
                         :            :  * HPAGE_RESV_OWNER is tested by vma_has_reserves() to decide whether a
                         :            :  * private mapping may use reserves.  HPAGE_RESV_MASK covers both flag
                         :            :  * bits; vma_resv_map() masks them off to recover the resv_map pointer
                         :            :  * and set_vma_resv_map() uses it to keep them when storing a new one.
                         :            :  * NOTE(review): HPAGE_RESV_UNMAPPED is not referenced in this section;
                         :            :  * confirm its meaning against its users.
     618                 :            :  */
     619                 :            : #define HPAGE_RESV_OWNER    (1UL << 0)
     620                 :            : #define HPAGE_RESV_UNMAPPED (1UL << 1)
     621                 :            : #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
     622                 :            : 
     623                 :            : /*
     624                 :            :  * These helpers are used to track how many pages are reserved for
     625                 :            :  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
     626                 :            :  * is guaranteed to have their future faults succeed.
     627                 :            :  *
     628                 :            :  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
     629                 :            :  * the reserve counters are updated with the hugetlb_lock held. It is safe
     630                 :            :  * to reset the VMA at fork() time as it is not in use yet and there is no
     631                 :            :  * chance of the global counters getting corrupted as a result of the values.
     632                 :            :  *
     633                 :            :  * The private mapping reservation is represented in a subtly different
     634                 :            :  * manner to a shared mapping.  A shared mapping has a region map associated
     635                 :            :  * with the underlying file, this region map represents the backing file
     636                 :            :  * pages which have ever had a reservation assigned which this persists even
     637                 :            :  * after the page is instantiated.  A private mapping has a region map
     638                 :            :  * associated with the original mmap which is attached to all VMAs which
     639                 :            :  * reference it, this region map represents those offsets which have consumed
     640                 :            :  * reservation ie. where pages have been instantiated.
     641                 :            :  */
     642                 :          0 : static unsigned long get_vma_private_data(struct vm_area_struct *vma)
     643                 :            : {
     644                 :          0 :         return (unsigned long)vma->vm_private_data;
     645                 :            : }
     646                 :            : 
     647                 :          0 : static void set_vma_private_data(struct vm_area_struct *vma,
     648                 :            :                                                         unsigned long value)
     649                 :            : {
     650                 :          0 :         vma->vm_private_data = (void *)value;
     651                 :            : }
     652                 :            : 
     653                 :          0 : struct resv_map *resv_map_alloc(void)
     654                 :            : {
     655                 :          0 :         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
     656                 :          0 :         struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
     657                 :            : 
     658         [ #  # ]:          0 :         if (!resv_map || !rg) {
     659                 :          0 :                 kfree(resv_map);
     660                 :          0 :                 kfree(rg);
     661                 :          0 :                 return NULL;
     662                 :            :         }
     663                 :            : 
     664                 :          0 :         kref_init(&resv_map->refs);
     665                 :          0 :         spin_lock_init(&resv_map->lock);
     666                 :          0 :         INIT_LIST_HEAD(&resv_map->regions);
     667                 :            : 
     668                 :          0 :         resv_map->adds_in_progress = 0;
     669                 :            : 
     670                 :          0 :         INIT_LIST_HEAD(&resv_map->region_cache);
     671                 :          0 :         list_add(&rg->link, &resv_map->region_cache);
     672                 :          0 :         resv_map->region_cache_count = 1;
     673                 :            : 
     674                 :          0 :         return resv_map;
     675                 :            : }
     676                 :            : 
     677                 :          0 : void resv_map_release(struct kref *ref)
     678                 :            : {
     679                 :          0 :         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
     680                 :          0 :         struct list_head *head = &resv_map->region_cache;
     681                 :          0 :         struct file_region *rg, *trg;
     682                 :            : 
     683                 :            :         /* Clear out any active regions before we release the map. */
     684                 :          0 :         region_del(resv_map, 0, LONG_MAX);
     685                 :            : 
     686                 :            :         /* ... and any entries left in the cache */
     687         [ #  # ]:          0 :         list_for_each_entry_safe(rg, trg, head, link) {
     688                 :          0 :                 list_del(&rg->link);
     689                 :          0 :                 kfree(rg);
     690                 :            :         }
     691                 :            : 
     692                 :          0 :         VM_BUG_ON(resv_map->adds_in_progress);
     693                 :            : 
     694                 :          0 :         kfree(resv_map);
     695                 :          0 : }
     696                 :            : 
     697                 :          0 : static inline struct resv_map *inode_resv_map(struct inode *inode)
     698                 :            : {
     699                 :            :         /*
     700                 :            :          * At inode evict time, i_mapping may not point to the original
     701                 :            :          * address space within the inode.  This original address space
     702                 :            :          * contains the pointer to the resv_map.  So, always use the
     703                 :            :          * address space embedded within the inode.
     704                 :            :          * The VERY common case is inode->mapping == &inode->i_data but,
     705                 :            :          * this may not be true for device special inodes.
     706                 :            :          */
     707                 :          0 :         return (struct resv_map *)(&inode->i_data)->private_data;
     708                 :            : }
     709                 :            : 
     710                 :          0 : static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
     711                 :            : {
     712                 :          0 :         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
     713                 :          0 :         if (vma->vm_flags & VM_MAYSHARE) {
     714                 :          0 :                 struct address_space *mapping = vma->vm_file->f_mapping;
     715                 :          0 :                 struct inode *inode = mapping->host;
     716                 :            : 
     717                 :          0 :                 return inode_resv_map(inode);
     718                 :            : 
     719                 :            :         } else {
     720                 :          0 :                 return (struct resv_map *)(get_vma_private_data(vma) &
     721                 :            :                                                         ~HPAGE_RESV_MASK);
     722                 :            :         }
     723                 :            : }
     724                 :            : 
     725                 :          0 : static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
     726                 :            : {
     727                 :          0 :         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
     728                 :          0 :         VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
     729                 :            : 
     730                 :          0 :         set_vma_private_data(vma, (get_vma_private_data(vma) &
     731                 :          0 :                                 HPAGE_RESV_MASK) | (unsigned long)map);
     732                 :            : }
     733                 :            : 
     734                 :          0 : static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
     735                 :            : {
     736                 :          0 :         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
     737                 :          0 :         VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
     738                 :            : 
     739                 :          0 :         set_vma_private_data(vma, get_vma_private_data(vma) | flags);
     740                 :          0 : }
     741                 :            : 
     742                 :          0 : static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
     743                 :            : {
     744                 :          0 :         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
     745                 :            : 
     746                 :          0 :         return (get_vma_private_data(vma) & flag) != 0;
     747                 :            : }
     748                 :            : 
     749                 :            : /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
     750                 :          0 : void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
     751                 :            : {
     752                 :          0 :         VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
     753         [ #  # ]:          0 :         if (!(vma->vm_flags & VM_MAYSHARE))
     754                 :          0 :                 vma->vm_private_data = (void *)0;
     755                 :          0 : }
     756                 :            : 
     757                 :            : /* Returns true if the VMA has associated reserve pages */
     758                 :          0 : static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
     759                 :            : {
     760         [ #  # ]:          0 :         if (vma->vm_flags & VM_NORESERVE) {
     761                 :            :                 /*
     762                 :            :                  * This address is already reserved by other process(chg == 0),
     763                 :            :                  * so, we should decrement reserved count. Without decrementing,
     764                 :            :                  * reserve count remains after releasing inode, because this
     765                 :            :                  * allocated page will go into page cache and is regarded as
     766                 :            :                  * coming from reserved pool in releasing step.  Currently, we
     767                 :            :                  * don't have any other solution to deal with this situation
     768                 :            :                  * properly, so add work-around here.
     769                 :            :                  */
     770   [ #  #  #  # ]:          0 :                 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
     771                 :            :                         return true;
     772                 :            :                 else
     773                 :          0 :                         return false;
     774                 :            :         }
     775                 :            : 
     776                 :            :         /* Shared mappings always use reserves */
     777         [ #  # ]:          0 :         if (vma->vm_flags & VM_MAYSHARE) {
     778                 :            :                 /*
     779                 :            :                  * We know VM_NORESERVE is not set.  Therefore, there SHOULD
     780                 :            :                  * be a region map for all pages.  The only situation where
     781                 :            :                  * there is no region map is if a hole was punched via
     782                 :            :                  * fallocate.  In this case, there really are no reverves to
     783                 :            :                  * use.  This situation is indicated if chg != 0.
     784                 :            :                  */
     785         [ #  # ]:          0 :                 if (chg)
     786                 :            :                         return false;
     787                 :            :                 else
     788                 :          0 :                         return true;
     789                 :            :         }
     790                 :            : 
     791                 :            :         /*
     792                 :            :          * Only the process that called mmap() has reserves for
     793                 :            :          * private mappings.
     794                 :            :          */
     795         [ #  # ]:          0 :         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
     796                 :            :                 /*
     797                 :            :                  * Like the shared case above, a hole punch or truncate
     798                 :            :                  * could have been performed on the private mapping.
     799                 :            :                  * Examine the value of chg to determine if reserves
     800                 :            :                  * actually exist or were previously consumed.
     801                 :            :                  * Very Subtle - The value of chg comes from a previous
     802                 :            :                  * call to vma_needs_reserves().  The reserve map for
     803                 :            :                  * private mappings has different (opposite) semantics
     804                 :            :                  * than that of shared mappings.  vma_needs_reserves()
     805                 :            :                  * has already taken this difference in semantics into
     806                 :            :                  * account.  Therefore, the meaning of chg is the same
     807                 :            :                  * as in the shared case above.  Code could easily be
     808                 :            :                  * combined, but keeping it separate draws attention to
     809                 :            :                  * subtle differences.
     810                 :            :                  */
     811         [ #  # ]:          0 :                 if (chg)
     812                 :            :                         return false;
     813                 :            :                 else
     814                 :          0 :                         return true;
     815                 :            :         }
     816                 :            : 
     817                 :            :         return false;
     818                 :            : }
     819                 :            : 
     820                 :          0 : static void enqueue_huge_page(struct hstate *h, struct page *page)
     821                 :            : {
     822                 :          0 :         int nid = page_to_nid(page);
     823                 :          0 :         list_move(&page->lru, &h->hugepage_freelists[nid]);
     824                 :          0 :         h->free_huge_pages++;
     825                 :          0 :         h->free_huge_pages_node[nid]++;
     826                 :          0 : }
     827                 :            : 
     828                 :          0 : static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
     829                 :            : {
     830                 :          0 :         struct page *page;
     831                 :            : 
     832                 :          0 :         list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
     833                 :            :                 if (!PageHWPoison(page))
     834                 :            :                         break;
     835                 :            :         /*
     836                 :            :          * if 'non-isolated free hugepage' not found on the list,
     837                 :            :          * the allocation fails.
     838                 :            :          */
     839         [ #  # ]:          0 :         if (&h->hugepage_freelists[nid] == &page->lru)
     840                 :            :                 return NULL;
     841                 :          0 :         list_move(&page->lru, &h->hugepage_activelist);
     842                 :          0 :         set_page_refcounted(page);
     843                 :          0 :         h->free_huge_pages--;
     844                 :          0 :         h->free_huge_pages_node[nid]--;
     845                 :          0 :         return page;
     846                 :            : }
     847                 :            : 
     848                 :          0 : static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
     849                 :            :                 nodemask_t *nmask)
     850                 :            : {
     851                 :          0 :         unsigned int cpuset_mems_cookie;
     852                 :          0 :         struct zonelist *zonelist;
     853                 :          0 :         struct zone *zone;
     854                 :          0 :         struct zoneref *z;
     855                 :          0 :         int node = NUMA_NO_NODE;
     856                 :            : 
     857         [ #  # ]:          0 :         zonelist = node_zonelist(nid, gfp_mask);
     858                 :            : 
     859                 :          0 : retry_cpuset:
     860                 :          0 :         cpuset_mems_cookie = read_mems_allowed_begin();
     861   [ #  #  #  #  :          0 :         for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
                   #  # ]
     862                 :          0 :                 struct page *page;
     863                 :            : 
     864         [ #  # ]:          0 :                 if (!cpuset_zone_allowed(zone, gfp_mask))
     865                 :          0 :                         continue;
     866                 :            :                 /*
     867                 :            :                  * no need to ask again on the same node. Pool is node rather than
     868                 :            :                  * zone aware
     869                 :            :                  */
     870         [ #  # ]:          0 :                 if (zone_to_nid(zone) == node)
     871                 :          0 :                         continue;
     872                 :          0 :                 node = zone_to_nid(zone);
     873                 :            : 
     874                 :          0 :                 page = dequeue_huge_page_node_exact(h, node);
     875         [ #  # ]:          0 :                 if (page)
     876                 :          0 :                         return page;
     877                 :            :         }
     878   [ #  #  #  # ]:          0 :         if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
     879                 :          0 :                 goto retry_cpuset;
     880                 :            : 
     881                 :            :         return NULL;
     882                 :            : }
     883                 :            : 
     884                 :            : /* Movability of hugepages depends on migration support. */
     885                 :          0 : static inline gfp_t htlb_alloc_mask(struct hstate *h)
     886                 :            : {
     887                 :          0 :         if (hugepage_movable_supported(h))
     888                 :            :                 return GFP_HIGHUSER_MOVABLE;
     889                 :            :         else
     890                 :          0 :                 return GFP_HIGHUSER;
     891                 :            : }
     892                 :            : 
     893                 :          0 : static struct page *dequeue_huge_page_vma(struct hstate *h,
     894                 :            :                                 struct vm_area_struct *vma,
     895                 :            :                                 unsigned long address, int avoid_reserve,
     896                 :            :                                 long chg)
     897                 :            : {
     898                 :          0 :         struct page *page;
     899                 :          0 :         struct mempolicy *mpol;
     900                 :          0 :         gfp_t gfp_mask;
     901                 :          0 :         nodemask_t *nodemask;
     902                 :          0 :         int nid;
     903                 :            : 
     904                 :            :         /*
     905                 :            :          * A child process with MAP_PRIVATE mappings created by its parent
     906                 :            :          * has no page reserves. This check ensures that reservations are
     907                 :            :          * not "stolen". The child may still get SIGKILLed
     908                 :            :          */
     909         [ #  # ]:          0 :         if (!vma_has_reserves(vma, chg) &&
     910         [ #  # ]:          0 :                         h->free_huge_pages - h->resv_huge_pages == 0)
     911                 :          0 :                 goto err;
     912                 :            : 
     913                 :            :         /* If reserves cannot be used, ensure enough pages are in the pool */
     914   [ #  #  #  # ]:          0 :         if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
     915                 :          0 :                 goto err;
     916                 :            : 
     917         [ #  # ]:          0 :         gfp_mask = htlb_alloc_mask(h);
     918                 :          0 :         nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
     919                 :          0 :         page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
     920   [ #  #  #  # ]:          0 :         if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
     921                 :          0 :                 SetPagePrivate(page);
     922                 :          0 :                 h->resv_huge_pages--;
     923                 :            :         }
     924                 :            : 
     925         [ #  # ]:          0 :         mpol_cond_put(mpol);
     926                 :            :         return page;
     927                 :            : 
     928                 :          0 : err:
     929                 :            :         return NULL;
     930                 :            : }
     931                 :            : 
     932                 :            : /*
     933                 :            :  * common helper functions for hstate_next_node_to_{alloc|free}.
     934                 :            :  * We may have allocated or freed a huge page based on a different
     935                 :            :  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
     936                 :            :  * be outside of *nodes_allowed.  Ensure that we use an allowed
     937                 :            :  * node for alloc or free.
     938                 :            :  */
     939                 :          0 : static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
     940                 :            : {
     941                 :          0 :         nid = next_node_in(nid, *nodes_allowed);
     942                 :          0 :         VM_BUG_ON(nid >= MAX_NUMNODES);
     943                 :            : 
     944                 :          0 :         return nid;
     945                 :            : }
     946                 :            : 
     947                 :          0 : static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
     948                 :            : {
     949         [ #  # ]:          0 :         if (!node_isset(nid, *nodes_allowed))
     950                 :          0 :                 nid = next_node_allowed(nid, nodes_allowed);
     951                 :          0 :         return nid;
     952                 :            : }
     953                 :            : 
     954                 :            : /*
     955                 :            :  * returns the previously saved node ["this node"] from which to
     956                 :            :  * allocate a persistent huge page for the pool and advance the
     957                 :            :  * next node from which to allocate, handling wrap at end of node
     958                 :            :  * mask.
     959                 :            :  */
     960                 :            : static int hstate_next_node_to_alloc(struct hstate *h,
     961                 :            :                                         nodemask_t *nodes_allowed)
     962                 :            : {
     963                 :            :         int nid;
     964                 :            : 
     965                 :            :         VM_BUG_ON(!nodes_allowed);
     966                 :            : 
     967                 :            :         nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
     968                 :            :         h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
     969                 :            : 
     970                 :            :         return nid;
     971                 :            : }
     972                 :            : 
     973                 :            : /*
     974                 :            :  * helper for free_pool_huge_page() - return the previously saved
     975                 :            :  * node ["this node"] from which to free a huge page.  Advance the
     976                 :            :  * next node id whether or not we find a free huge page to free so
     977                 :            :  * that the next attempt to free addresses the next node.
     978                 :            :  */
     979                 :            : static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
     980                 :            : {
     981                 :            :         int nid;
     982                 :            : 
     983                 :            :         VM_BUG_ON(!nodes_allowed);
     984                 :            : 
     985                 :            :         nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
     986                 :            :         h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
     987                 :            : 
     988                 :            :         return nid;
     989                 :            : }
     990                 :            : 
     991                 :            : #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)           \
     992                 :            :         for (nr_nodes = nodes_weight(*mask);                            \
     993                 :            :                 nr_nodes > 0 &&                                              \
     994                 :            :                 ((node = hstate_next_node_to_alloc(hs, mask)) || 1);    \
     995                 :            :                 nr_nodes--)
     996                 :            : 
     997                 :            : #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)            \
     998                 :            :         for (nr_nodes = nodes_weight(*mask);                            \
     999                 :            :                 nr_nodes > 0 &&                                              \
    1000                 :            :                 ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
    1001                 :            :                 nr_nodes--)
    1002                 :            : 
    1003                 :            : #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
    1004                 :          0 : static void destroy_compound_gigantic_page(struct page *page,
    1005                 :            :                                         unsigned int order)
    1006                 :            : {
    1007                 :          0 :         int i;
    1008                 :          0 :         int nr_pages = 1 << order;
    1009                 :          0 :         struct page *p = page + 1;
    1010                 :            : 
    1011                 :          0 :         atomic_set(compound_mapcount_ptr(page), 0);
    1012         [ #  # ]:          0 :         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
    1013                 :          0 :                 clear_compound_head(p);
    1014                 :          0 :                 set_page_refcounted(p);
    1015                 :            :         }
    1016                 :            : 
    1017                 :          0 :         set_compound_order(page, 0);
    1018                 :          0 :         __ClearPageHead(page);
    1019                 :          0 : }
    1020                 :            : 
    1021                 :          0 : static void free_gigantic_page(struct page *page, unsigned int order)
    1022                 :            : {
    1023                 :          0 :         free_contig_range(page_to_pfn(page), 1 << order);
    1024                 :          0 : }
    1025                 :            : 
    1026                 :            : #ifdef CONFIG_CONTIG_ALLOC
    1027                 :            : static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
    1028                 :            :                 int nid, nodemask_t *nodemask)
    1029                 :            : {
    1030                 :            :         unsigned long nr_pages = 1UL << huge_page_order(h);
    1031                 :            : 
    1032                 :            :         return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
    1033                 :            : }
    1034                 :            : 
    1035                 :            : static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
    1036                 :            : static void prep_compound_gigantic_page(struct page *page, unsigned int order);
    1037                 :            : #else /* !CONFIG_CONTIG_ALLOC */
    1038                 :            : static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
    1039                 :            :                                         int nid, nodemask_t *nodemask)
    1040                 :            : {
    1041                 :            :         return NULL;
    1042                 :            : }
    1043                 :            : #endif /* CONFIG_CONTIG_ALLOC */
    1044                 :            : 
    1045                 :            : #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
    1046                 :            : static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
    1047                 :            :                                         int nid, nodemask_t *nodemask)
    1048                 :            : {
    1049                 :            :         return NULL;
    1050                 :            : }
    1051                 :            : static inline void free_gigantic_page(struct page *page, unsigned int order) { }
    1052                 :            : static inline void destroy_compound_gigantic_page(struct page *page,
    1053                 :            :                                                 unsigned int order) { }
    1054                 :            : #endif
    1055                 :            : 
    1056                 :          0 : static void update_and_free_page(struct hstate *h, struct page *page)
    1057                 :            : {
    1058                 :          0 :         int i;
    1059                 :            : 
    1060                 :          0 :         if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
    1061                 :            :                 return;
    1062                 :            : 
    1063                 :          0 :         h->nr_huge_pages--;
    1064                 :          0 :         h->nr_huge_pages_node[page_to_nid(page)]--;
    1065         [ #  # ]:          0 :         for (i = 0; i < pages_per_huge_page(h); i++) {
    1066                 :          0 :                 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
    1067                 :            :                                 1 << PG_referenced | 1 << PG_dirty |
    1068                 :            :                                 1 << PG_active | 1 << PG_private |
    1069                 :            :                                 1 << PG_writeback);
    1070                 :            :         }
    1071                 :          0 :         VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
    1072                 :          0 :         set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
    1073                 :          0 :         set_page_refcounted(page);
    1074         [ #  # ]:          0 :         if (hstate_is_gigantic(h)) {
    1075                 :          0 :                 destroy_compound_gigantic_page(page, huge_page_order(h));
    1076                 :          0 :                 free_gigantic_page(page, huge_page_order(h));
    1077                 :            :         } else {
    1078                 :          0 :                 __free_pages(page, huge_page_order(h));
    1079                 :            :         }
    1080                 :            : }
    1081                 :            : 
    1082                 :          0 : struct hstate *size_to_hstate(unsigned long size)
    1083                 :            : {
    1084                 :        312 :         struct hstate *h;
    1085                 :            : 
    1086   [ -  +  -  +  :        312 :         for_each_hstate(h) {
          +  -  -  -  -  
                -  -  + ]
    1087   [ -  -  -  -  :         78 :                 if (huge_page_size(h) == size)
          -  +  -  -  -  
                -  -  - ]
    1088                 :          0 :                         return h;
    1089                 :            :         }
    1090                 :            :         return NULL;
    1091                 :            : }
    1092                 :            : 
    1093                 :            : /*
    1094                 :            :  * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
    1095                 :            :  * to hstate->hugepage_activelist.)
    1096                 :            :  *
    1097                 :            :  * This function can be called for tail pages, but never returns true for them.
    1098                 :            :  */
    1099                 :          0 : bool page_huge_active(struct page *page)
    1100                 :            : {
    1101                 :          0 :         VM_BUG_ON_PAGE(!PageHuge(page), page);
    1102   [ #  #  #  # ]:          0 :         return PageHead(page) && PagePrivate(&page[1]);
    1103                 :            : }
    1104                 :            : 
    1105                 :            : /* never called for tail page */
    1106                 :          0 : static void set_page_huge_active(struct page *page)
    1107                 :            : {
    1108                 :          0 :         VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
    1109                 :          0 :         SetPagePrivate(&page[1]);
    1110                 :          0 : }
    1111                 :            : 
    1112                 :          0 : static void clear_page_huge_active(struct page *page)
    1113                 :            : {
    1114                 :          0 :         VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
    1115                 :          0 :         ClearPagePrivate(&page[1]);
    1116                 :            : }
    1117                 :            : 
    1118                 :            : /*
    1119                 :            :  * Internal hugetlb specific page flag. Do not use outside of the hugetlb
    1120                 :            :  * code
    1121                 :            :  */
    1122                 :          0 : static inline bool PageHugeTemporary(struct page *page)
    1123                 :            : {
    1124   [ #  #  #  # ]:          0 :         if (!PageHuge(page))
    1125                 :            :                 return false;
    1126                 :            : 
    1127                 :          0 :         return (unsigned long)page[2].mapping == -1U;
    1128                 :            : }
    1129                 :            : 
    1130                 :          0 : static inline void SetPageHugeTemporary(struct page *page)
    1131                 :            : {
    1132                 :          0 :         page[2].mapping = (void *)-1U;
    1133                 :            : }
    1134                 :            : 
    1135                 :          0 : static inline void ClearPageHugeTemporary(struct page *page)
    1136                 :            : {
    1137                 :          0 :         page[2].mapping = NULL;
    1138                 :            : }
    1139                 :            : 
    1140                 :          0 : static void __free_huge_page(struct page *page)
    1141                 :            : {
    1142                 :            :         /*
    1143                 :            :          * Can't pass hstate in here because it is called from the
    1144                 :            :          * compound page destructor.
    1145                 :            :          */
    1146                 :          0 :         struct hstate *h = page_hstate(page);
    1147                 :          0 :         int nid = page_to_nid(page);
    1148                 :          0 :         struct hugepage_subpool *spool =
    1149                 :          0 :                 (struct hugepage_subpool *)page_private(page);
    1150                 :          0 :         bool restore_reserve;
    1151                 :            : 
    1152                 :          0 :         VM_BUG_ON_PAGE(page_count(page), page);
    1153                 :          0 :         VM_BUG_ON_PAGE(page_mapcount(page), page);
    1154                 :            : 
    1155                 :          0 :         set_page_private(page, 0);
    1156                 :          0 :         page->mapping = NULL;
    1157                 :          0 :         restore_reserve = PagePrivate(page);
    1158                 :          0 :         ClearPagePrivate(page);
    1159                 :            : 
    1160                 :            :         /*
    1161                 :            :          * If PagePrivate() was set on page, page allocation consumed a
    1162                 :            :          * reservation.  If the page was associated with a subpool, there
    1163                 :            :          * would have been a page reserved in the subpool before allocation
    1164                 :            :          * via hugepage_subpool_get_pages().  Since we are 'restoring' the
    1165                 :            :          * reservation, do not call hugepage_subpool_put_pages() as this will
    1166                 :            :          * remove the reserved page from the subpool.
    1167                 :            :          */
    1168         [ #  # ]:          0 :         if (!restore_reserve) {
    1169                 :            :                 /*
    1170                 :            :                  * A return code of zero implies that the subpool will be
    1171                 :            :                  * under its minimum size if the reservation is not restored
    1172                 :            :                  * after page is free.  Therefore, force restore_reserve
    1173                 :            :                  * operation.
    1174                 :            :                  */
    1175         [ #  # ]:          0 :                 if (hugepage_subpool_put_pages(spool, 1) == 0)
    1176                 :          0 :                         restore_reserve = true;
    1177                 :            :         }
    1178                 :            : 
    1179                 :          0 :         spin_lock(&hugetlb_lock);
    1180                 :          0 :         clear_page_huge_active(page);
    1181         [ #  # ]:          0 :         hugetlb_cgroup_uncharge_page(hstate_index(h),
    1182                 :            :                                      pages_per_huge_page(h), page);
    1183         [ #  # ]:          0 :         if (restore_reserve)
    1184                 :          0 :                 h->resv_huge_pages++;
    1185                 :            : 
    1186         [ #  # ]:          0 :         if (PageHugeTemporary(page)) {
    1187                 :          0 :                 list_del(&page->lru);
    1188                 :          0 :                 ClearPageHugeTemporary(page);
    1189                 :          0 :                 update_and_free_page(h, page);
    1190         [ #  # ]:          0 :         } else if (h->surplus_huge_pages_node[nid]) {
    1191                 :            :                 /* remove the page from active list */
    1192                 :          0 :                 list_del(&page->lru);
    1193                 :          0 :                 update_and_free_page(h, page);
    1194                 :          0 :                 h->surplus_huge_pages--;
    1195                 :          0 :                 h->surplus_huge_pages_node[nid]--;
    1196                 :            :         } else {
    1197                 :          0 :                 arch_clear_hugepage_flags(page);
    1198                 :          0 :                 enqueue_huge_page(h, page);
    1199                 :            :         }
    1200                 :          0 :         spin_unlock(&hugetlb_lock);
    1201                 :          0 : }
    1202                 :            : 
    1203                 :            : /*
    1204                 :            :  * As free_huge_page() can be called from a non-task context, we have
    1205                 :            :  * to defer the actual freeing in a workqueue to prevent potential
    1206                 :            :  * hugetlb_lock deadlock.
    1207                 :            :  *
    1208                 :            :  * free_hpage_workfn() locklessly retrieves the linked list of pages to
    1209                 :            :  * be freed and frees them one-by-one. As the page->mapping pointer is
    1210                 :            :  * going to be cleared in __free_huge_page() anyway, it is reused as the
    1211                 :            :  * llist_node structure of a lockless linked list of huge pages to be freed.
    1212                 :            :  */
    1213                 :            : static LLIST_HEAD(hpage_freelist);
    1214                 :            : 
    1215                 :          0 : static void free_hpage_workfn(struct work_struct *work)
    1216                 :            : {
    1217                 :          0 :         struct llist_node *node;
    1218                 :          0 :         struct page *page;
    1219                 :            : 
    1220                 :          0 :         node = llist_del_all(&hpage_freelist);
    1221                 :            : 
    1222         [ #  # ]:          0 :         while (node) {
    1223                 :          0 :                 page = container_of((struct address_space **)node,
    1224                 :            :                                      struct page, mapping);
    1225                 :          0 :                 node = node->next;
    1226                 :          0 :                 __free_huge_page(page);
    1227                 :            :         }
    1228                 :          0 : }
    1229                 :            : static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
    1230                 :            : 
    1231                 :          0 : void free_huge_page(struct page *page)
    1232                 :            : {
    1233                 :            :         /*
    1234                 :            :          * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
    1235                 :            :          */
    1236         [ #  # ]:          0 :         if (!in_task()) {
    1237                 :            :                 /*
    1238                 :            :                  * Only call schedule_work() if hpage_freelist is previously
    1239                 :            :                  * empty. Otherwise, schedule_work() had been called but the
    1240                 :            :                  * workfn hasn't retrieved the list yet.
    1241                 :            :                  */
    1242         [ #  # ]:          0 :                 if (llist_add((struct llist_node *)&page->mapping,
    1243                 :            :                               &hpage_freelist))
    1244                 :          0 :                         schedule_work(&free_hpage_work);
    1245                 :          0 :                 return;
    1246                 :            :         }
    1247                 :            : 
    1248                 :          0 :         __free_huge_page(page);
    1249                 :            : }
    1250                 :            : 
    1251                 :          0 : static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
    1252                 :            : {
    1253                 :          0 :         INIT_LIST_HEAD(&page->lru);
    1254                 :          0 :         set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
    1255                 :          0 :         spin_lock(&hugetlb_lock);
    1256                 :          0 :         set_hugetlb_cgroup(page, NULL);
    1257                 :          0 :         h->nr_huge_pages++;
    1258                 :          0 :         h->nr_huge_pages_node[nid]++;
    1259                 :          0 :         spin_unlock(&hugetlb_lock);
    1260                 :            : }
    1261                 :            : 
    1262                 :          0 : static void prep_compound_gigantic_page(struct page *page, unsigned int order)
    1263                 :            : {
    1264                 :          0 :         int i;
    1265                 :          0 :         int nr_pages = 1 << order;
    1266                 :          0 :         struct page *p = page + 1;
    1267                 :            : 
    1268                 :            :         /* we rely on prep_new_huge_page to set the destructor */
    1269                 :          0 :         set_compound_order(page, order);
    1270                 :          0 :         __ClearPageReserved(page);
    1271                 :          0 :         __SetPageHead(page);
    1272         [ #  # ]:          0 :         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
    1273                 :            :                 /*
    1274                 :            :                  * For gigantic hugepages allocated through bootmem at
    1275                 :            :                  * boot, it's safer to be consistent with the not-gigantic
    1276                 :            :                  * hugepages and clear the PG_reserved bit from all tail pages
    1277                 :            :                  * too.  Otherwise drivers using get_user_pages() to access tail
    1278                 :            :                  * pages may get the reference counting wrong if they see
    1279                 :            :                  * PG_reserved set on a tail page (despite the head page not
    1280                 :            :                  * having PG_reserved set).  Enforcing this consistency between
    1281                 :            :                  * head and tail pages allows drivers to optimize away a check
    1282                 :            :                  * on the head page when they need to know if put_page() is needed
    1283                 :            :                  * after get_user_pages().
    1284                 :            :                  */
    1285                 :          0 :                 __ClearPageReserved(p);
    1286                 :          0 :                 set_page_count(p, 0);
    1287                 :          0 :                 set_compound_head(p, page);
    1288                 :            :         }
    1289                 :          0 :         atomic_set(compound_mapcount_ptr(page), -1);
    1290                 :          0 : }
    1291                 :            : 
    1292                 :            : /*
    1293                 :            :  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
    1294                 :            :  * transparent huge pages.  See the PageTransHuge() documentation for more
    1295                 :            :  * details.
    1296                 :            :  */
    1297                 :   93977820 : int PageHuge(struct page *page)
    1298                 :            : {
    1299         [ -  + ]:  187955570 :         if (!PageCompound(page))
    1300                 :            :                 return 0;
    1301                 :            : 
    1302         [ #  # ]:          0 :         page = compound_head(page);
    1303                 :          0 :         return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
    1304                 :            : }
    1305                 :            : EXPORT_SYMBOL_GPL(PageHuge);
    1306                 :            : 
    1307                 :            : /*
    1308                 :            :  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
    1309                 :            :  * normal or transparent huge pages.
    1310                 :            :  */
    1311                 :          0 : int PageHeadHuge(struct page *page_head)
    1312                 :            : {
    1313         [ #  # ]:          0 :         if (!PageHead(page_head))
    1314                 :            :                 return 0;
    1315                 :            : 
    1316                 :          0 :         return get_compound_page_dtor(page_head) == free_huge_page;
    1317                 :            : }
    1318                 :            : 
    1319                 :          0 : pgoff_t __basepage_index(struct page *page)
    1320                 :            : {
    1321         [ #  # ]:          0 :         struct page *page_head = compound_head(page);
    1322                 :          0 :         pgoff_t index = page_index(page_head);
    1323                 :          0 :         unsigned long compound_idx;
    1324                 :            : 
    1325         [ #  # ]:          0 :         if (!PageHuge(page_head))
    1326                 :          0 :                 return page_index(page);
    1327                 :            : 
    1328         [ #  # ]:          0 :         if (compound_order(page_head) >= MAX_ORDER)
    1329                 :          0 :                 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
    1330                 :            :         else
    1331                 :          0 :                 compound_idx = page - page_head;
    1332                 :            : 
    1333                 :          0 :         return (index << compound_order(page_head)) + compound_idx;
    1334                 :            : }
    1335                 :            : 
    1336                 :          0 : static struct page *alloc_buddy_huge_page(struct hstate *h,
    1337                 :            :                 gfp_t gfp_mask, int nid, nodemask_t *nmask,
    1338                 :            :                 nodemask_t *node_alloc_noretry)
    1339                 :            : {
    1340         [ #  # ]:          0 :         int order = huge_page_order(h);
    1341                 :          0 :         struct page *page;
    1342                 :          0 :         bool alloc_try_hard = true;
    1343                 :            : 
    1344                 :            :         /*
    1345                 :            :          * By default we always try hard to allocate the page with
    1346                 :            :          * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
    1347                 :            :          * a loop (to adjust global huge page counts) and previous allocation
    1348                 :            :          * failed, do not continue to try hard on the same node.  Use the
    1349                 :            :          * node_alloc_noretry bitmap to manage this state information.
    1350                 :            :          */
    1351   [ #  #  #  # ]:          0 :         if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
    1352                 :          0 :                 alloc_try_hard = false;
    1353                 :          0 :         gfp_mask |= __GFP_COMP|__GFP_NOWARN;
    1354                 :          0 :         if (alloc_try_hard)
    1355                 :          0 :                 gfp_mask |= __GFP_RETRY_MAYFAIL;
    1356         [ #  # ]:          0 :         if (nid == NUMA_NO_NODE)
    1357                 :          0 :                 nid = numa_mem_id();
    1358                 :          0 :         page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
    1359         [ #  # ]:          0 :         if (page)
    1360                 :          0 :                 __count_vm_event(HTLB_BUDDY_PGALLOC);
    1361                 :            :         else
    1362                 :          0 :                 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
    1363                 :            : 
    1364                 :            :         /*
    1365                 :            :          * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
    1366                 :            :          * indicates an overall state change.  Clear bit so that we resume
    1367                 :            :          * normal 'try hard' allocations.
    1368                 :            :          */
    1369   [ #  #  #  # ]:          0 :         if (node_alloc_noretry && page && !alloc_try_hard)
    1370                 :          0 :                 node_clear(nid, *node_alloc_noretry);
    1371                 :            : 
    1372                 :            :         /*
    1373                 :            :          * If we tried hard to get a page but failed, set bit so that
    1374                 :            :          * subsequent attempts will not try as hard until there is an
    1375                 :            :          * overall state change.
    1376                 :            :          */
    1377   [ #  #  #  # ]:          0 :         if (node_alloc_noretry && !page && alloc_try_hard)
    1378                 :          0 :                 node_set(nid, *node_alloc_noretry);
    1379                 :            : 
    1380                 :          0 :         return page;
    1381                 :            : }
    1382                 :            : 
    1383                 :            : /*
    1384                 :            :  * Common helper to allocate a fresh hugetlb page. All specific allocators
    1385                 :            :  * should use this function to get new hugetlb pages
    1386                 :            :  */
    1387                 :          0 : static struct page *alloc_fresh_huge_page(struct hstate *h,
    1388                 :            :                 gfp_t gfp_mask, int nid, nodemask_t *nmask,
    1389                 :            :                 nodemask_t *node_alloc_noretry)
    1390                 :            : {
    1391                 :          0 :         struct page *page;
    1392                 :            : 
    1393         [ #  # ]:          0 :         if (hstate_is_gigantic(h))
    1394                 :            :                 page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
    1395                 :            :         else
    1396                 :          0 :                 page = alloc_buddy_huge_page(h, gfp_mask,
    1397                 :            :                                 nid, nmask, node_alloc_noretry);
    1398         [ #  # ]:          0 :         if (!page)
    1399                 :          0 :                 return NULL;
    1400                 :            : 
    1401         [ #  # ]:          0 :         if (hstate_is_gigantic(h))
    1402                 :          0 :                 prep_compound_gigantic_page(page, huge_page_order(h));
    1403                 :          0 :         prep_new_huge_page(h, page, page_to_nid(page));
    1404                 :            : 
    1405                 :          0 :         return page;
    1406                 :            : }
    1407                 :            : 
    1408                 :            : /*
    1409                 :            :  * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
    1410                 :            :  * manner.
    1411                 :            :  */
    1412                 :          0 : static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
    1413                 :            :                                 nodemask_t *node_alloc_noretry)
    1414                 :            : {
    1415                 :          0 :         struct page *page;
    1416                 :          0 :         int nr_nodes, node;
    1417         [ #  # ]:          0 :         gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
    1418                 :            : 
    1419         [ #  # ]:          0 :         for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
    1420                 :          0 :                 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
    1421                 :            :                                                 node_alloc_noretry);
    1422         [ #  # ]:          0 :                 if (page)
    1423                 :            :                         break;
    1424                 :            :         }
    1425                 :            : 
    1426         [ #  # ]:          0 :         if (!page)
    1427                 :            :                 return 0;
    1428                 :            : 
    1429                 :          0 :         put_page(page); /* free it into the hugepage allocator */
    1430                 :            : 
    1431                 :          0 :         return 1;
    1432                 :            : }
    1433                 :            : 
    1434                 :            : /*
    1435                 :            :  * Free huge page from pool from next node to free.
    1436                 :            :  * Attempt to keep persistent huge pages more or less
    1437                 :            :  * balanced over allowed nodes.
    1438                 :            :  * Called with hugetlb_lock locked.
    1439                 :            :  */
    1440                 :          0 : static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
    1441                 :            :                                                          bool acct_surplus)
    1442                 :            : {
    1443                 :          0 :         int nr_nodes, node;
    1444                 :          0 :         int ret = 0;
    1445                 :            : 
    1446         [ #  # ]:          0 :         for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
    1447                 :            :                 /*
    1448                 :            :                  * If we're returning unused surplus pages, only examine
    1449                 :            :                  * nodes with surplus pages.
    1450                 :            :                  */
    1451   [ #  #  #  #  :          0 :                 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
                   #  # ]
    1452         [ #  # ]:          0 :                     !list_empty(&h->hugepage_freelists[node])) {
    1453                 :          0 :                         struct page *page =
    1454                 :          0 :                                 list_entry(h->hugepage_freelists[node].next,
    1455                 :            :                                           struct page, lru);
    1456         [ #  # ]:          0 :                         list_del(&page->lru);
    1457                 :          0 :                         h->free_huge_pages--;
    1458                 :          0 :                         h->free_huge_pages_node[node]--;
    1459         [ #  # ]:          0 :                         if (acct_surplus) {
    1460                 :          0 :                                 h->surplus_huge_pages--;
    1461                 :          0 :                                 h->surplus_huge_pages_node[node]--;
    1462                 :            :                         }
    1463                 :          0 :                         update_and_free_page(h, page);
    1464                 :          0 :                         ret = 1;
    1465                 :          0 :                         break;
    1466                 :            :                 }
    1467                 :            :         }
    1468                 :            : 
    1469                 :          0 :         return ret;
    1470                 :            : }
    1471                 :            : 
    1472                 :            : /*
    1473                 :            :  * Dissolve a given free hugepage into free buddy pages. This function does
    1474                 :            :  * nothing for in-use hugepages and non-hugepages.
    1475                 :            :  * This function returns values like below:
    1476                 :            :  *
    1477                 :            :  *  -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
    1478                 :            :  *          (allocated or reserved.)
    1479                 :            :  *       0: successfully dissolved free hugepages or the page is not a
    1480                 :            :  *          hugepage (considered as already dissolved)
    1481                 :            :  */
    1482                 :          0 : int dissolve_free_huge_page(struct page *page)
    1483                 :            : {
    1484                 :          0 :         int rc = -EBUSY;
    1485                 :            : 
    1486                 :            :         /* Not to disrupt normal path by vainly holding hugetlb_lock */
    1487         [ #  # ]:          0 :         if (!PageHuge(page))
    1488                 :            :                 return 0;
    1489                 :            : 
    1490                 :          0 :         spin_lock(&hugetlb_lock);
    1491         [ #  # ]:          0 :         if (!PageHuge(page)) {
    1492                 :          0 :                 rc = 0;
    1493                 :          0 :                 goto out;
    1494                 :            :         }
    1495                 :            : 
    1496   [ #  #  #  # ]:          0 :         if (!page_count(page)) {
    1497         [ #  # ]:          0 :                 struct page *head = compound_head(page);
    1498                 :          0 :                 struct hstate *h = page_hstate(head);
    1499         [ #  # ]:          0 :                 int nid = page_to_nid(head);
    1500         [ #  # ]:          0 :                 if (h->free_huge_pages - h->resv_huge_pages == 0)
    1501                 :          0 :                         goto out;
    1502                 :            :                 /*
    1503                 :            :                  * Move PageHWPoison flag from head page to the raw error page,
    1504                 :            :                  * which makes any subpages rather than the error page reusable.
    1505                 :            :                  */
    1506                 :          0 :                 if (PageHWPoison(head) && page != head) {
    1507                 :            :                         SetPageHWPoison(page);
    1508                 :            :                         ClearPageHWPoison(head);
    1509                 :            :                 }
    1510                 :          0 :                 list_del(&head->lru);
    1511                 :          0 :                 h->free_huge_pages--;
    1512                 :          0 :                 h->free_huge_pages_node[nid]--;
    1513                 :          0 :                 h->max_huge_pages--;
    1514                 :          0 :                 update_and_free_page(h, head);
    1515                 :          0 :                 rc = 0;
    1516                 :            :         }
    1517                 :          0 : out:
    1518                 :          0 :         spin_unlock(&hugetlb_lock);
    1519                 :          0 :         return rc;
    1520                 :            : }
    1521                 :            : 
    1522                 :            : /*
    1523                 :            :  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
    1524                 :            :  * make specified memory blocks removable from the system.
    1525                 :            :  * Note that this will dissolve a free gigantic hugepage completely, if any
    1526                 :            :  * part of it lies within the given range.
    1527                 :            :  * Also note that if dissolve_free_huge_page() returns with an error, all
    1528                 :            :  * free hugepages that were dissolved before that error are lost.
    1529                 :            :  */
    1530                 :          0 : int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
    1531                 :            : {
    1532                 :          0 :         unsigned long pfn;
    1533                 :          0 :         struct page *page;
    1534                 :          0 :         int rc = 0;
    1535                 :            : 
    1536         [ #  # ]:          0 :         if (!hugepages_supported())
    1537                 :            :                 return rc;
    1538                 :            : 
    1539         [ #  # ]:          0 :         for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
    1540                 :          0 :                 page = pfn_to_page(pfn);
    1541                 :          0 :                 rc = dissolve_free_huge_page(page);
    1542         [ #  # ]:          0 :                 if (rc)
    1543                 :            :                         break;
    1544                 :            :         }
    1545                 :            : 
    1546                 :            :         return rc;
    1547                 :            : }
    1548                 :            : 
    1549                 :            : /*
    1550                 :            :  * Allocates a fresh surplus page from the page allocator.
    1551                 :            :  */
    1552                 :          0 : static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
    1553                 :            :                 int nid, nodemask_t *nmask)
    1554                 :            : {
    1555                 :          0 :         struct page *page = NULL;
    1556                 :            : 
    1557         [ #  # ]:          0 :         if (hstate_is_gigantic(h))
    1558                 :            :                 return NULL;
    1559                 :            : 
    1560                 :          0 :         spin_lock(&hugetlb_lock);
    1561         [ #  # ]:          0 :         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
    1562                 :          0 :                 goto out_unlock;
    1563                 :          0 :         spin_unlock(&hugetlb_lock);
    1564                 :            : 
    1565                 :          0 :         page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
    1566         [ #  # ]:          0 :         if (!page)
    1567                 :            :                 return NULL;
    1568                 :            : 
    1569                 :          0 :         spin_lock(&hugetlb_lock);
    1570                 :            :         /*
    1571                 :            :          * We could have raced with the pool size change.
    1572                 :            :          * Double check that and simply deallocate the new page
    1573                 :            :          * if we would end up overcommitting the surpluses. Abuse
    1574                 :            :          * temporary page to workaround the nasty free_huge_page
    1575                 :            :          * codeflow
    1576                 :            :          */
    1577         [ #  # ]:          0 :         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
    1578                 :          0 :                 SetPageHugeTemporary(page);
    1579                 :          0 :                 spin_unlock(&hugetlb_lock);
    1580                 :          0 :                 put_page(page);
    1581                 :          0 :                 return NULL;
    1582                 :            :         } else {
    1583                 :          0 :                 h->surplus_huge_pages++;
    1584                 :          0 :                 h->surplus_huge_pages_node[page_to_nid(page)]++;
    1585                 :            :         }
    1586                 :            : 
    1587                 :          0 : out_unlock:
    1588                 :          0 :         spin_unlock(&hugetlb_lock);
    1589                 :            : 
    1590                 :          0 :         return page;
    1591                 :            : }
    1592                 :            : 
    1593                 :          0 : struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
    1594                 :            :                                      int nid, nodemask_t *nmask)
    1595                 :            : {
    1596                 :          0 :         struct page *page;
    1597                 :            : 
    1598         [ #  # ]:          0 :         if (hstate_is_gigantic(h))
    1599                 :            :                 return NULL;
    1600                 :            : 
    1601                 :          0 :         page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
    1602   [ #  #  #  #  :          0 :         if (!page)
                   #  # ]
    1603                 :            :                 return NULL;
    1604                 :            : 
    1605                 :            :         /*
    1606                 :            :          * We do not account these pages as surplus because they are only
    1607                 :            :          * temporary and will be released properly on the last reference
    1608                 :            :          */
    1609                 :          0 :         SetPageHugeTemporary(page);
    1610                 :            : 
    1611                 :          0 :         return page;
    1612                 :            : }
    1613                 :            : 
    1614                 :            : /*
    1615                 :            :  * Use the VMA's mpolicy to allocate a huge page from the buddy.
    1616                 :            :  */
    1617                 :            : static
    1618                 :          0 : struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
    1619                 :            :                 struct vm_area_struct *vma, unsigned long addr)
    1620                 :            : {
    1621                 :          0 :         struct page *page;
    1622                 :          0 :         struct mempolicy *mpol;
    1623         [ #  # ]:          0 :         gfp_t gfp_mask = htlb_alloc_mask(h);
    1624                 :          0 :         int nid;
    1625                 :          0 :         nodemask_t *nodemask;
    1626                 :            : 
    1627                 :          0 :         nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
    1628                 :          0 :         page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
    1629         [ #  # ]:          0 :         mpol_cond_put(mpol);
    1630                 :            : 
    1631                 :          0 :         return page;
    1632                 :            : }
    1633                 :            : 
    1634                 :            : /* page migration callback function */
    1635                 :          0 : struct page *alloc_huge_page_node(struct hstate *h, int nid)
    1636                 :            : {
    1637         [ #  # ]:          0 :         gfp_t gfp_mask = htlb_alloc_mask(h);
    1638                 :          0 :         struct page *page = NULL;
    1639                 :            : 
    1640         [ #  # ]:          0 :         if (nid != NUMA_NO_NODE)
    1641                 :          0 :                 gfp_mask |= __GFP_THISNODE;
    1642                 :            : 
    1643                 :          0 :         spin_lock(&hugetlb_lock);
    1644         [ #  # ]:          0 :         if (h->free_huge_pages - h->resv_huge_pages > 0)
    1645                 :          0 :                 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
    1646                 :          0 :         spin_unlock(&hugetlb_lock);
    1647                 :            : 
    1648         [ #  # ]:          0 :         if (!page)
    1649         [ #  # ]:          0 :                 page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
    1650                 :            : 
    1651                 :          0 :         return page;
    1652                 :            : }
    1653                 :            : 
    1654                 :            : /* page migration callback function */
    1655                 :          0 : struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
    1656                 :            :                 nodemask_t *nmask)
    1657                 :            : {
    1658         [ #  # ]:          0 :         gfp_t gfp_mask = htlb_alloc_mask(h);
    1659                 :            : 
    1660                 :          0 :         spin_lock(&hugetlb_lock);
    1661         [ #  # ]:          0 :         if (h->free_huge_pages - h->resv_huge_pages > 0) {
    1662                 :          0 :                 struct page *page;
    1663                 :            : 
    1664                 :          0 :                 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
    1665         [ #  # ]:          0 :                 if (page) {
    1666                 :          0 :                         spin_unlock(&hugetlb_lock);
    1667                 :          0 :                         return page;
    1668                 :            :                 }
    1669                 :            :         }
    1670                 :          0 :         spin_unlock(&hugetlb_lock);
    1671                 :            : 
    1672         [ #  # ]:          0 :         return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
    1673                 :            : }
    1674                 :            : 
    1675                 :            : /* mempolicy aware migration callback */
    1676                 :          0 : struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
    1677                 :            :                 unsigned long address)
    1678                 :            : {
    1679                 :          0 :         struct mempolicy *mpol;
    1680                 :          0 :         nodemask_t *nodemask;
    1681                 :          0 :         struct page *page;
    1682                 :          0 :         gfp_t gfp_mask;
    1683                 :          0 :         int node;
    1684                 :            : 
    1685         [ #  # ]:          0 :         gfp_mask = htlb_alloc_mask(h);
    1686                 :          0 :         node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
    1687                 :          0 :         page = alloc_huge_page_nodemask(h, node, nodemask);
    1688         [ #  # ]:          0 :         mpol_cond_put(mpol);
    1689                 :            : 
    1690                 :          0 :         return page;
    1691                 :            : }
    1692                 :            : 
    1693                 :            : /*
    1694                 :            :  * Increase the hugetlb pool such that it can accommodate a reservation
    1695                 :            :  * of size 'delta'.
    1696                 :            :  */
    1697                 :          0 : static int gather_surplus_pages(struct hstate *h, int delta)
    1698                 :            : {
    1699                 :          0 :         struct list_head surplus_list;
    1700                 :          0 :         struct page *page, *tmp;
    1701                 :          0 :         int ret, i;
    1702                 :          0 :         int needed, allocated;
    1703                 :          0 :         bool alloc_ok = true;
    1704                 :            : 
    1705                 :          0 :         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
    1706         [ #  # ]:          0 :         if (needed <= 0) {
    1707                 :          0 :                 h->resv_huge_pages += delta;
    1708                 :          0 :                 return 0;
    1709                 :            :         }
    1710                 :            : 
    1711                 :          0 :         allocated = 0;
    1712                 :          0 :         INIT_LIST_HEAD(&surplus_list);
    1713                 :            : 
    1714                 :          0 :         ret = -ENOMEM;
    1715                 :          0 : retry:
    1716                 :          0 :         spin_unlock(&hugetlb_lock);
    1717         [ #  # ]:          0 :         for (i = 0; i < needed; i++) {
    1718         [ #  # ]:          0 :                 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
    1719                 :            :                                 NUMA_NO_NODE, NULL);
    1720         [ #  # ]:          0 :                 if (!page) {
    1721                 :            :                         alloc_ok = false;
    1722                 :            :                         break;
    1723                 :            :                 }
    1724                 :          0 :                 list_add(&page->lru, &surplus_list);
    1725                 :          0 :                 cond_resched();
    1726                 :            :         }
    1727                 :          0 :         allocated += i;
    1728                 :            : 
    1729                 :            :         /*
    1730                 :            :          * After retaking hugetlb_lock, we need to recalculate 'needed'
    1731                 :            :          * because either resv_huge_pages or free_huge_pages may have changed.
    1732                 :            :          */
    1733                 :          0 :         spin_lock(&hugetlb_lock);
    1734                 :          0 :         needed = (h->resv_huge_pages + delta) -
    1735                 :          0 :                         (h->free_huge_pages + allocated);
    1736         [ #  # ]:          0 :         if (needed > 0) {
    1737         [ #  # ]:          0 :                 if (alloc_ok)
    1738                 :          0 :                         goto retry;
    1739                 :            :                 /*
    1740                 :            :                  * We were not able to allocate enough pages to
    1741                 :            :                  * satisfy the entire reservation so we free what
    1742                 :            :                  * we've allocated so far.
    1743                 :            :                  */
    1744                 :          0 :                 goto free;
    1745                 :            :         }
    1746                 :            :         /*
    1747                 :            :          * The surplus_list now contains _at_least_ the number of extra pages
    1748                 :            :          * needed to accommodate the reservation.  Add the appropriate number
    1749                 :            :          * of pages to the hugetlb pool and free the extras back to the buddy
    1750                 :            :          * allocator.  Commit the entire reservation here to prevent another
    1751                 :            :          * process from stealing the pages as they are added to the pool but
    1752                 :            :          * before they are reserved.
    1753                 :            :          */
    1754                 :          0 :         needed += allocated;
    1755                 :          0 :         h->resv_huge_pages += delta;
    1756                 :          0 :         ret = 0;
    1757                 :            : 
    1758                 :            :         /* Free the needed pages to the hugetlb pool */
    1759         [ #  # ]:          0 :         list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
    1760         [ #  # ]:          0 :                 if ((--needed) < 0)
    1761                 :            :                         break;
    1762                 :            :                 /*
    1763                 :            :                  * This page is now managed by the hugetlb allocator and has
    1764                 :            :                  * no users -- drop the buddy allocator's reference.
    1765                 :            :                  */
    1766                 :          0 :                 put_page_testzero(page);
    1767                 :          0 :                 VM_BUG_ON_PAGE(page_count(page), page);
    1768                 :          0 :                 enqueue_huge_page(h, page);
    1769                 :            :         }
    1770                 :          0 : free:
    1771                 :          0 :         spin_unlock(&hugetlb_lock);
    1772                 :            : 
    1773                 :            :         /* Free unnecessary surplus pages to the buddy allocator */
    1774         [ #  # ]:          0 :         list_for_each_entry_safe(page, tmp, &surplus_list, lru)
    1775                 :          0 :                 put_page(page);
    1776                 :          0 :         spin_lock(&hugetlb_lock);
    1777                 :            : 
    1778                 :          0 :         return ret;
    1779                 :            : }
    1780                 :            : 
    1781                 :            : /*
    1782                 :            :  * This routine has two main purposes:
    1783                 :            :  * 1) Decrement the reservation count (resv_huge_pages) by the value passed
    1784                 :            :  *    in unused_resv_pages.  This corresponds to the prior adjustments made
    1785                 :            :  *    to the associated reservation map.
    1786                 :            :  * 2) Free any unused surplus pages that may have been allocated to satisfy
    1787                 :            :  *    the reservation.  As many as unused_resv_pages may be freed.
    1788                 :            :  *
    1789                 :            :  * Called with hugetlb_lock held.  However, the lock could be dropped (and
    1790                 :            :  * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
    1791                 :            :  * we must make sure nobody else can claim pages we are in the process of
    1792                 :            :  * freeing.  Do this by ensuring resv_huge_pages is always greater than the
    1793                 :            :  * number of huge pages we plan to free when dropping the lock.
    1794                 :            :  */
    1795                 :          0 : static void return_unused_surplus_pages(struct hstate *h,
    1796                 :            :                                         unsigned long unused_resv_pages)
    1797                 :            : {
    1798                 :          0 :         unsigned long nr_pages;
    1799                 :            : 
    1800                 :            :         /* Cannot return gigantic pages currently */
    1801         [ #  # ]:          0 :         if (hstate_is_gigantic(h))
    1802                 :          0 :                 goto out;
    1803                 :            : 
    1804                 :            :         /*
    1805                 :            :          * Part (or even all) of the reservation could have been backed
    1806                 :            :          * by pre-allocated pages. Only free surplus pages.
    1807                 :            :          */
    1808                 :          0 :         nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
    1809                 :            : 
    1810                 :            :         /*
    1811                 :            :          * We want to release as many surplus pages as possible, spread
    1812                 :            :          * evenly across all nodes with memory. Iterate across these nodes
    1813                 :            :          * until we can no longer free unreserved surplus pages. This occurs
    1814                 :            :          * when the nodes with surplus pages have no free pages.
    1815                 :            :          * free_pool_huge_page() will balance the freed pages across the
    1816                 :            :          * on-line nodes with memory and will handle the hstate accounting.
    1817                 :            :          *
    1818                 :            :          * Note that we decrement resv_huge_pages as we free the pages.  If
    1819                 :            :          * we drop the lock, resv_huge_pages will still be sufficiently large
    1820                 :            :          * to cover subsequent pages we may free.
    1821                 :            :          */
    1822         [ #  # ]:          0 :         while (nr_pages--) {
    1823                 :          0 :                 h->resv_huge_pages--;
    1824                 :          0 :                 unused_resv_pages--;
    1825         [ #  # ]:          0 :                 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
    1826                 :          0 :                         goto out;
    1827                 :          0 :                 cond_resched_lock(&hugetlb_lock);
    1828                 :            :         }
    1829                 :            : 
    1830                 :          0 : out:
    1831                 :            :         /* Fully uncommit the reservation */
    1832                 :          0 :         h->resv_huge_pages -= unused_resv_pages;
    1833                 :          0 : }
    1834                 :            : 
    1835                 :            : 
    1836                 :            : /*
    1837                 :            :  * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
    1838                 :            :  * are used by the huge page allocation routines to manage reservations.
    1839                 :            :  *
    1840                 :            :  * vma_needs_reservation is called to determine if the huge page at addr
    1841                 :            :  * within the vma has an associated reservation.  If a reservation is
    1842                 :            :  * needed, the value 1 is returned.  The caller is then responsible for
    1843                 :            :  * managing the global reservation and subpool usage counts.  After
    1844                 :            :  * the huge page has been allocated, vma_commit_reservation is called
    1845                 :            :  * to add the page to the reservation map.  If the page allocation fails,
    1846                 :            :  * the reservation must be ended instead of committed.  vma_end_reservation
    1847                 :            :  * is called in such cases.
    1848                 :            :  *
    1849                 :            :  * In the normal case, vma_commit_reservation returns the same value
    1850                 :            :  * as the preceding vma_needs_reservation call.  The only time this
    1851                 :            :  * is not the case is if a reserve map was changed between calls.  It
    1852                 :            :  * is the responsibility of the caller to notice the difference and
    1853                 :            :  * take appropriate action.
    1854                 :            :  *
    1855                 :            :  * vma_add_reservation is used in error paths where a reservation must
    1856                 :            :  * be restored when a newly allocated huge page must be freed.  It is
    1857                 :            :  * to be called after calling vma_needs_reservation to determine if a
    1858                 :            :  * reservation exists.
    1859                 :            :  */
    1860                 :            : enum vma_resv_mode {
    1861                 :            :         VMA_NEEDS_RESV,
    1862                 :            :         VMA_COMMIT_RESV,
    1863                 :            :         VMA_END_RESV,
    1864                 :            :         VMA_ADD_RESV,
    1865                 :            : };
    1866                 :          0 : static long __vma_reservation_common(struct hstate *h,
    1867                 :            :                                 struct vm_area_struct *vma, unsigned long addr,
    1868                 :            :                                 enum vma_resv_mode mode)
    1869                 :            : {
    1870                 :          0 :         struct resv_map *resv;
    1871                 :          0 :         pgoff_t idx;
    1872                 :          0 :         long ret;
    1873                 :            : 
    1874         [ #  # ]:          0 :         resv = vma_resv_map(vma);
    1875         [ #  # ]:          0 :         if (!resv)
    1876                 :            :                 return 1;
    1877                 :            : 
    1878   [ #  #  #  #  :          0 :         idx = vma_hugecache_offset(h, vma, addr);
                      # ]
    1879   [ #  #  #  #  :          0 :         switch (mode) {
                      # ]
    1880                 :          0 :         case VMA_NEEDS_RESV:
    1881                 :          0 :                 ret = region_chg(resv, idx, idx + 1);
    1882                 :          0 :                 break;
    1883                 :          0 :         case VMA_COMMIT_RESV:
    1884                 :          0 :                 ret = region_add(resv, idx, idx + 1);
    1885                 :          0 :                 break;
    1886                 :          0 :         case VMA_END_RESV:
    1887                 :          0 :                 region_abort(resv, idx, idx + 1);
    1888                 :          0 :                 ret = 0;
    1889                 :          0 :                 break;
    1890                 :          0 :         case VMA_ADD_RESV:
    1891         [ #  # ]:          0 :                 if (vma->vm_flags & VM_MAYSHARE)
    1892                 :          0 :                         ret = region_add(resv, idx, idx + 1);
    1893                 :            :                 else {
    1894                 :          0 :                         region_abort(resv, idx, idx + 1);
    1895                 :          0 :                         ret = region_del(resv, idx, idx + 1);
    1896                 :            :                 }
    1897                 :            :                 break;
    1898                 :          0 :         default:
    1899                 :          0 :                 BUG();
    1900                 :            :         }
    1901                 :            : 
    1902         [ #  # ]:          0 :         if (vma->vm_flags & VM_MAYSHARE)
    1903                 :            :                 return ret;
    1904   [ #  #  #  # ]:          0 :         else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
    1905                 :            :                 /*
    1906                 :            :                  * In most cases, reserves always exist for private mappings.
    1907                 :            :                  * However, a file associated with mapping could have been
    1908                 :            :                  * hole punched or truncated after reserves were consumed.
    1909                 :            :                  * A subsequent fault on such a range will not use reserves.
    1910                 :            :                  * Subtle - The reserve map for private mappings has the
    1911                 :            :                  * opposite meaning than that of shared mappings.  If NO
    1912                 :            :                  * entry is in the reserve map, it means a reservation exists.
    1913                 :            :                  * If an entry exists in the reserve map, it means the
    1914                 :            :                  * reservation has already been consumed.  As a result, the
    1915                 :            :                  * return value of this routine is the opposite of the
    1916                 :            :                  * value returned from reserve map manipulation routines above.
    1917                 :            :                  */
    1918         [ #  # ]:          0 :                 if (ret)
    1919                 :            :                         return 0;
    1920                 :            :                 else
    1921                 :          0 :                         return 1;
    1922                 :            :         }
    1923                 :            :         else
    1924                 :          0 :                 return ret < 0 ? ret : 0;
    1925                 :            : }
    1926                 :            : 
    1927                 :          0 : static long vma_needs_reservation(struct hstate *h,
    1928                 :            :                         struct vm_area_struct *vma, unsigned long addr)
    1929                 :            : {
    1930                 :          0 :         return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
    1931                 :            : }
    1932                 :            : 
    1933                 :          0 : static long vma_commit_reservation(struct hstate *h,
    1934                 :            :                         struct vm_area_struct *vma, unsigned long addr)
    1935                 :            : {
    1936                 :          0 :         return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
    1937                 :            : }
    1938                 :            : 
    1939                 :          0 : static void vma_end_reservation(struct hstate *h,
    1940                 :            :                         struct vm_area_struct *vma, unsigned long addr)
    1941                 :            : {
    1942                 :          0 :         (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
    1943                 :          0 : }
    1944                 :            : 
    1945                 :          0 : static long vma_add_reservation(struct hstate *h,
    1946                 :            :                         struct vm_area_struct *vma, unsigned long addr)
    1947                 :            : {
    1948                 :          0 :         return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
    1949                 :            : }
    1950                 :            : 
    1951                 :            : /*
    1952                 :            :  * This routine is called to restore a reservation on error paths.  In the
    1953                 :            :  * specific error paths, a huge page was allocated (via alloc_huge_page)
    1954                 :            :  * and is about to be freed.  If a reservation for the page existed,
    1955                 :            :  * alloc_huge_page would have consumed the reservation and set PagePrivate
    1956                 :            :  * in the newly allocated page.  When the page is freed via free_huge_page,
    1957                 :            :  * the global reservation count will be incremented if PagePrivate is set.
    1958                 :            :  * However, free_huge_page can not adjust the reserve map.  Adjust the
    1959                 :            :  * reserve map here to be consistent with global reserve count adjustments
    1960                 :            :  * to be made by free_huge_page.
    1961                 :            :  */
    1962                 :          0 : static void restore_reserve_on_error(struct hstate *h,
    1963                 :            :                         struct vm_area_struct *vma, unsigned long address,
    1964                 :            :                         struct page *page)
    1965                 :            : {
    1966         [ #  # ]:          0 :         if (unlikely(PagePrivate(page))) {
    1967                 :          0 :                 long rc = vma_needs_reservation(h, vma, address);
    1968                 :            : 
    1969         [ #  # ]:          0 :                 if (unlikely(rc < 0)) {
    1970                 :            :                         /*
    1971                 :            :                          * Rare out of memory condition in reserve map
    1972                 :            :                          * manipulation.  Clear PagePrivate so that
    1973                 :            :                          * global reserve count will not be incremented
    1974                 :            :                          * by free_huge_page.  This will make it appear
    1975                 :            :                          * as though the reservation for this page was
    1976                 :            :                          * consumed.  This may prevent the task from
    1977                 :            :                          * faulting in the page at a later time.  This
    1978                 :            :                          * is better than inconsistent global huge page
    1979                 :            :                          * accounting of reserve counts.
    1980                 :            :                          */
    1981                 :          0 :                         ClearPagePrivate(page);
    1982         [ #  # ]:          0 :                 } else if (rc) {
    1983                 :          0 :                         rc = vma_add_reservation(h, vma, address);
    1984         [ #  # ]:          0 :                         if (unlikely(rc < 0))
    1985                 :            :                                 /*
    1986                 :            :                                  * See above comment about rare out of
    1987                 :            :                                  * memory condition.
    1988                 :            :                                  */
    1989                 :          0 :                                 ClearPagePrivate(page);
    1990                 :            :                 } else
    1991                 :          0 :                         vma_end_reservation(h, vma, address);
    1992                 :            :         }
    1993                 :          0 : }
    1994                 :            : 
    1995                 :          0 : struct page *alloc_huge_page(struct vm_area_struct *vma,
    1996                 :            :                                     unsigned long addr, int avoid_reserve)
    1997                 :            : {
    1998                 :          0 :         struct hugepage_subpool *spool = subpool_vma(vma);
    1999                 :          0 :         struct hstate *h = hstate_vma(vma);
    2000                 :          0 :         struct page *page;
    2001                 :          0 :         long map_chg, map_commit;
    2002                 :          0 :         long gbl_chg;
    2003                 :          0 :         int ret, idx;
    2004                 :          0 :         struct hugetlb_cgroup *h_cg;
    2005                 :            : 
    2006                 :          0 :         idx = hstate_index(h);
    2007                 :            :         /*
    2008                 :            :          * Examine the region/reserve map to determine if the process
    2009                 :            :          * has a reservation for the page to be allocated.  A return
    2010                 :            :          * code of zero indicates a reservation exists (no change).
    2011                 :            :          */
    2012                 :          0 :         map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
    2013         [ #  # ]:          0 :         if (map_chg < 0)
    2014                 :            :                 return ERR_PTR(-ENOMEM);
    2015                 :            : 
    2016                 :            :         /*
    2017                 :            :          * Processes that did not create the mapping will have no
    2018                 :            :          * reserves as indicated by the region/reserve map. Check
    2019                 :            :          * that the allocation will not exceed the subpool limit.
    2020                 :            :          * Allocations for MAP_NORESERVE mappings also need to be
    2021                 :            :          * checked against any subpool limit.
    2022                 :            :          */
    2023         [ #  # ]:          0 :         if (map_chg || avoid_reserve) {
    2024                 :          0 :                 gbl_chg = hugepage_subpool_get_pages(spool, 1);
    2025         [ #  # ]:          0 :                 if (gbl_chg < 0) {
    2026                 :          0 :                         vma_end_reservation(h, vma, addr);
    2027                 :          0 :                         return ERR_PTR(-ENOSPC);
    2028                 :            :                 }
    2029                 :            : 
    2030                 :            :                 /*
    2031                 :            :                  * Even though there was no reservation in the region/reserve
    2032                 :            :                  * map, there could be reservations associated with the
    2033                 :            :                  * subpool that can be used.  This would be indicated if the
    2034                 :            :                  * return value of hugepage_subpool_get_pages() is zero.
    2035                 :            :                  * However, if avoid_reserve is specified we still avoid even
    2036                 :            :                  * the subpool reservations.
    2037                 :            :                  */
    2038         [ #  # ]:          0 :                 if (avoid_reserve)
    2039                 :          0 :                         gbl_chg = 1;
    2040                 :            :         }
    2041                 :            : 
    2042                 :          0 :         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
    2043                 :          0 :         if (ret)
    2044                 :            :                 goto out_subpool_put;
    2045                 :            : 
    2046                 :          0 :         spin_lock(&hugetlb_lock);
    2047                 :            :         /*
    2048                 :            :          * gbl_chg is passed to indicate whether or not a page must be taken
    2049                 :            :          * from the global free pool (global change).  gbl_chg == 0 indicates
    2050                 :            :          * a reservation exists for the allocation.
    2051                 :            :          */
    2052                 :          0 :         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
    2053         [ #  # ]:          0 :         if (!page) {
    2054                 :          0 :                 spin_unlock(&hugetlb_lock);
    2055                 :          0 :                 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
    2056         [ #  # ]:          0 :                 if (!page)
    2057                 :          0 :                         goto out_uncharge_cgroup;
    2058   [ #  #  #  # ]:          0 :                 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
    2059                 :          0 :                         SetPagePrivate(page);
    2060                 :          0 :                         h->resv_huge_pages--;
    2061                 :            :                 }
    2062                 :          0 :                 spin_lock(&hugetlb_lock);
    2063                 :          0 :                 list_move(&page->lru, &h->hugepage_activelist);
    2064                 :            :                 /* Fall through */
    2065                 :            :         }
    2066                 :          0 :         hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
    2067                 :          0 :         spin_unlock(&hugetlb_lock);
    2068                 :            : 
    2069                 :          0 :         set_page_private(page, (unsigned long)spool);
    2070                 :            : 
    2071                 :          0 :         map_commit = vma_commit_reservation(h, vma, addr);
    2072         [ #  # ]:          0 :         if (unlikely(map_chg > map_commit)) {
    2073                 :            :                 /*
    2074                 :            :                  * The page was added to the reservation map between
    2075                 :            :                  * vma_needs_reservation and vma_commit_reservation.
    2076                 :            :                  * This indicates a race with hugetlb_reserve_pages.
    2077                 :            :                  * Adjust for the subpool count incremented above AND
    2078                 :            :                  * in hugetlb_reserve_pages for the same page.  Also,
    2079                 :            :                  * the reservation count added in hugetlb_reserve_pages
    2080                 :            :                  * no longer applies.
    2081                 :            :                  */
    2082                 :          0 :                 long rsv_adjust;
    2083                 :            : 
    2084                 :          0 :                 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
    2085                 :          0 :                 hugetlb_acct_memory(h, -rsv_adjust);
    2086                 :            :         }
    2087                 :            :         return page;
    2088                 :            : 
    2089                 :            : out_uncharge_cgroup:
    2090         [ #  # ]:          0 :         hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
    2091                 :            : out_subpool_put:
    2092         [ #  # ]:          0 :         if (map_chg || avoid_reserve)
    2093                 :          0 :                 hugepage_subpool_put_pages(spool, 1);
    2094                 :          0 :         vma_end_reservation(h, vma, addr);
    2095                 :          0 :         return ERR_PTR(-ENOSPC);
    2096                 :            : }
    2097                 :            : 
    2098                 :            : int alloc_bootmem_huge_page(struct hstate *h)
    2099                 :            :         __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
    2100                 :          0 : int __alloc_bootmem_huge_page(struct hstate *h)
    2101                 :            : {
    2102                 :          0 :         struct huge_bootmem_page *m;
    2103                 :          0 :         int nr_nodes, node;
    2104                 :            : 
    2105         [ #  # ]:          0 :         for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
    2106                 :          0 :                 void *addr;
    2107                 :            : 
    2108                 :          0 :                 addr = memblock_alloc_try_nid_raw(
    2109                 :          0 :                                 huge_page_size(h), huge_page_size(h),
    2110                 :            :                                 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
    2111         [ #  # ]:          0 :                 if (addr) {
    2112                 :            :                         /*
    2113                 :            :                          * Use the beginning of the huge page to store the
    2114                 :            :                          * huge_bootmem_page struct (until gather_bootmem
    2115                 :            :                          * puts them into the mem_map).
    2116                 :            :                          */
    2117                 :          0 :                         m = addr;
    2118                 :          0 :                         goto found;
    2119                 :            :                 }
    2120                 :            :         }
    2121                 :            :         return 0;
    2122                 :            : 
    2123                 :            : found:
    2124   [ #  #  #  # ]:          0 :         BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
    2125                 :            :         /* Put them into a private list first because mem_map is not up yet */
    2126                 :          0 :         INIT_LIST_HEAD(&m->list);
    2127                 :          0 :         list_add(&m->list, &huge_boot_pages);
    2128                 :          0 :         m->hstate = h;
    2129                 :          0 :         return 1;
    2130                 :            : }
    2131                 :            : 
    2132                 :          0 : static void __init prep_compound_huge_page(struct page *page,
    2133                 :            :                 unsigned int order)
    2134                 :            : {
    2135         [ #  # ]:          0 :         if (unlikely(order > (MAX_ORDER - 1)))
    2136                 :          0 :                 prep_compound_gigantic_page(page, order);
    2137                 :            :         else
    2138                 :          0 :                 prep_compound_page(page, order);
    2139                 :          0 : }
    2140                 :            : 
    2141                 :            : /* Put bootmem huge pages into the standard lists after mem_map is up */
    2142                 :         78 : static void __init gather_bootmem_prealloc(void)
    2143                 :            : {
    2144                 :         78 :         struct huge_bootmem_page *m;
    2145                 :            : 
    2146         [ -  + ]:         78 :         list_for_each_entry(m, &huge_boot_pages, list) {
    2147         [ #  # ]:          0 :                 struct page *page = virt_to_page(m);
    2148                 :          0 :                 struct hstate *h = m->hstate;
    2149                 :            : 
    2150   [ #  #  #  # ]:          0 :                 WARN_ON(page_count(page) != 1);
    2151                 :          0 :                 prep_compound_huge_page(page, h->order);
    2152         [ #  # ]:          0 :                 WARN_ON(PageReserved(page));
    2153                 :          0 :                 prep_new_huge_page(h, page, page_to_nid(page));
    2154                 :          0 :                 put_page(page); /* free it into the hugepage allocator */
    2155                 :            : 
    2156                 :            :                 /*
    2157                 :            :                  * If we had gigantic hugepages allocated at boot time, we need
    2158                 :            :                  * to restore the 'stolen' pages to totalram_pages in order to
    2159                 :            :                  * fix confusing memory reports from free(1) and another
    2160                 :            :                  * side-effects, like CommitLimit going negative.
    2161                 :            :                  */
    2162         [ #  # ]:          0 :                 if (hstate_is_gigantic(h))
    2163                 :          0 :                         adjust_managed_page_count(page, 1 << h->order);
    2164                 :          0 :                 cond_resched();
    2165                 :            :         }
    2166                 :         78 : }
    2167                 :            : 
    2168                 :         78 : static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
    2169                 :            : {
    2170                 :         78 :         unsigned long i;
    2171                 :         78 :         nodemask_t *node_alloc_noretry;
    2172                 :            : 
    2173         [ +  - ]:         78 :         if (!hstate_is_gigantic(h)) {
    2174                 :            :                 /*
    2175                 :            :                  * Bit mask controlling how hard we retry per-node allocations.
    2176                 :            :                  * Ignore errors as lower level routines can deal with
    2177                 :            :                  * node_alloc_noretry == NULL.  If this kmalloc fails at boot
    2178                 :            :                  * time, we are likely in bigger trouble.
    2179                 :            :                  */
    2180                 :         78 :                 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
    2181                 :            :                                                 GFP_KERNEL);
    2182                 :            :         } else {
    2183                 :            :                 /* allocations done at boot time */
    2184                 :            :                 node_alloc_noretry = NULL;
    2185                 :            :         }
    2186                 :            : 
    2187                 :            :         /* bit mask controlling how hard we retry per-node allocations */
    2188         [ +  - ]:         78 :         if (node_alloc_noretry)
    2189                 :         78 :                 nodes_clear(*node_alloc_noretry);
    2190                 :            : 
    2191         [ -  + ]:         78 :         for (i = 0; i < h->max_huge_pages; ++i) {
    2192         [ #  # ]:          0 :                 if (hstate_is_gigantic(h)) {
    2193         [ #  # ]:          0 :                         if (!alloc_bootmem_huge_page(h))
    2194                 :            :                                 break;
    2195         [ #  # ]:          0 :                 } else if (!alloc_pool_huge_page(h,
    2196                 :            :                                          &node_states[N_MEMORY],
    2197                 :            :                                          node_alloc_noretry))
    2198                 :            :                         break;
    2199                 :          0 :                 cond_resched();
    2200                 :            :         }
    2201         [ -  + ]:         78 :         if (i < h->max_huge_pages) {
    2202                 :          0 :                 char buf[32];
    2203                 :            : 
    2204                 :          0 :                 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
    2205                 :          0 :                 pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
    2206                 :            :                         h->max_huge_pages, buf, i);
    2207                 :          0 :                 h->max_huge_pages = i;
    2208                 :            :         }
    2209                 :            : 
    2210                 :         78 :         kfree(node_alloc_noretry);
    2211                 :         78 : }
    2212                 :            : 
    2213                 :         78 : static void __init hugetlb_init_hstates(void)
    2214                 :            : {
    2215                 :         78 :         struct hstate *h;
    2216                 :            : 
    2217         [ +  + ]:        156 :         for_each_hstate(h) {
    2218         [ +  - ]:         78 :                 if (minimum_order > huge_page_order(h))
    2219                 :         78 :                         minimum_order = huge_page_order(h);
    2220                 :            : 
    2221                 :            :                 /* oversize hugepages were init'ed in early boot */
    2222         [ +  - ]:         78 :                 if (!hstate_is_gigantic(h))
    2223                 :         78 :                         hugetlb_hstate_alloc_pages(h);
    2224                 :            :         }
    2225                 :         78 :         VM_BUG_ON(minimum_order == UINT_MAX);
    2226                 :         78 : }
    2227                 :            : 
    2228                 :         78 : static void __init report_hugepages(void)
    2229                 :            : {
    2230                 :         78 :         struct hstate *h;
    2231                 :            : 
    2232         [ +  + ]:        156 :         for_each_hstate(h) {
    2233                 :         78 :                 char buf[32];
    2234                 :            : 
    2235                 :         78 :                 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
    2236                 :         78 :                 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
    2237                 :            :                         buf, h->free_huge_pages);
    2238                 :            :         }
    2239                 :         78 : }
    2240                 :            : 
    2241                 :            : #ifdef CONFIG_HIGHMEM
    2242                 :            : static void try_to_free_low(struct hstate *h, unsigned long count,
    2243                 :            :                                                 nodemask_t *nodes_allowed)
    2244                 :            : {
    2245                 :            :         int i;
    2246                 :            : 
    2247                 :            :         if (hstate_is_gigantic(h))
    2248                 :            :                 return;
    2249                 :            : 
    2250                 :            :         for_each_node_mask(i, *nodes_allowed) {
    2251                 :            :                 struct page *page, *next;
    2252                 :            :                 struct list_head *freel = &h->hugepage_freelists[i];
    2253                 :            :                 list_for_each_entry_safe(page, next, freel, lru) {
    2254                 :            :                         if (count >= h->nr_huge_pages)
    2255                 :            :                                 return;
    2256                 :            :                         if (PageHighMem(page))
    2257                 :            :                                 continue;
    2258                 :            :                         list_del(&page->lru);
    2259                 :            :                         update_and_free_page(h, page);
    2260                 :            :                         h->free_huge_pages--;
    2261                 :            :                         h->free_huge_pages_node[page_to_nid(page)]--;
    2262                 :            :                 }
    2263                 :            :         }
    2264                 :            : }
    2265                 :            : #else
    2266                 :          0 : static inline void try_to_free_low(struct hstate *h, unsigned long count,
    2267                 :            :                                                 nodemask_t *nodes_allowed)
    2268                 :            : {
    2269                 :            : }
    2270                 :            : #endif
    2271                 :            : 
    2272                 :            : /*
    2273                 :            :  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
    2274                 :            :  * balanced by operating on them in a round-robin fashion.
    2275                 :            :  * Returns 1 if an adjustment was made.
    2276                 :            :  */
    2277                 :          0 : static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
    2278                 :            :                                 int delta)
    2279                 :            : {
    2280                 :          0 :         int nr_nodes, node;
    2281                 :            : 
    2282                 :          0 :         VM_BUG_ON(delta != -1 && delta != 1);
    2283                 :            : 
    2284         [ #  # ]:          0 :         if (delta < 0) {
    2285         [ #  # ]:          0 :                 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
    2286         [ #  # ]:          0 :                         if (h->surplus_huge_pages_node[node])
    2287                 :          0 :                                 goto found;
    2288                 :            :                 }
    2289                 :            :         } else {
    2290         [ #  # ]:          0 :                 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
    2291                 :          0 :                         if (h->surplus_huge_pages_node[node] <
    2292         [ #  # ]:          0 :                                         h->nr_huge_pages_node[node])
    2293                 :          0 :                                 goto found;
    2294                 :            :                 }
    2295                 :            :         }
    2296                 :            :         return 0;
    2297                 :            : 
    2298                 :          0 : found:
    2299                 :          0 :         h->surplus_huge_pages += delta;
    2300                 :          0 :         h->surplus_huge_pages_node[node] += delta;
    2301                 :          0 :         return 1;
    2302                 :            : }
    2303                 :            : 
    2304                 :            : #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
    2305                 :          0 : static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
    2306                 :            :                               nodemask_t *nodes_allowed)
    2307                 :            : {
    2308                 :          0 :         unsigned long min_count, ret;
    2309                 :          0 :         NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
    2310                 :            : 
    2311                 :            :         /*
    2312                 :            :          * Bit mask controlling how hard we retry per-node allocations.
    2313                 :            :          * If we can not allocate the bit mask, do not attempt to allocate
    2314                 :            :          * the requested huge pages.
    2315                 :            :          */
    2316                 :          0 :         if (node_alloc_noretry)
    2317                 :          0 :                 nodes_clear(*node_alloc_noretry);
    2318                 :            :         else
    2319                 :            :                 return -ENOMEM;
    2320                 :            : 
    2321                 :          0 :         spin_lock(&hugetlb_lock);
    2322                 :            : 
    2323                 :            :         /*
    2324                 :            :          * Check for a node specific request.
    2325                 :            :          * Changing node specific huge page count may require a corresponding
    2326                 :            :          * change to the global count.  In any case, the passed node mask
    2327                 :            :          * (nodes_allowed) will restrict alloc/free to the specified node.
    2328                 :            :          */
    2329         [ #  # ]:          0 :         if (nid != NUMA_NO_NODE) {
    2330                 :          0 :                 unsigned long old_count = count;
    2331                 :            : 
    2332                 :          0 :                 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
    2333                 :            :                 /*
    2334                 :            :                  * User may have specified a large count value which caused the
    2335                 :            :                  * above calculation to overflow.  In this case, they wanted
    2336                 :            :                  * to allocate as many huge pages as possible.  Set count to
    2337                 :            :                  * largest possible value to align with their intention.
    2338                 :            :                  */
    2339         [ #  # ]:          0 :                 if (count < old_count)
    2340                 :          0 :                         count = ULONG_MAX;
    2341                 :            :         }
    2342                 :            : 
    2343                 :            :         /*
    2344                 :            :          * Gigantic pages runtime allocation depend on the capability for large
    2345                 :            :          * page range allocation.
    2346                 :            :          * If the system does not provide this feature, return an error when
    2347                 :            :          * the user tries to allocate gigantic pages but let the user free the
    2348                 :            :          * boottime allocated gigantic pages.
    2349                 :            :          */
    2350         [ #  # ]:          0 :         if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
    2351         [ #  # ]:          0 :                 if (count > persistent_huge_pages(h)) {
    2352                 :          0 :                         spin_unlock(&hugetlb_lock);
    2353                 :          0 :                         NODEMASK_FREE(node_alloc_noretry);
    2354                 :          0 :                         return -EINVAL;
    2355                 :            :                 }
    2356                 :            :                 /* Fall through to decrease pool */
    2357                 :            :         }
    2358                 :            : 
    2359                 :            :         /*
    2360                 :            :          * Increase the pool size
    2361                 :            :          * First take pages out of surplus state.  Then make up the
    2362                 :            :          * remaining difference by allocating fresh huge pages.
    2363                 :            :          *
    2364                 :            :          * We might race with alloc_surplus_huge_page() here and be unable
    2365                 :            :          * to convert a surplus huge page to a normal huge page. That is
    2366                 :            :          * not critical, though, it just means the overall size of the
    2367                 :            :          * pool might be one hugepage larger than it needs to be, but
    2368                 :            :          * within all the constraints specified by the sysctls.
    2369                 :            :          */
    2370   [ #  #  #  # ]:          0 :         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
    2371         [ #  # ]:          0 :                 if (!adjust_pool_surplus(h, nodes_allowed, -1))
    2372                 :            :                         break;
    2373                 :            :         }
    2374                 :            : 
    2375         [ #  # ]:          0 :         while (count > persistent_huge_pages(h)) {
    2376                 :            :                 /*
    2377                 :            :                  * If this allocation races such that we no longer need the
    2378                 :            :                  * page, free_huge_page will handle it by freeing the page
    2379                 :            :                  * and reducing the surplus.
    2380                 :            :                  */
    2381                 :          0 :                 spin_unlock(&hugetlb_lock);
    2382                 :            : 
    2383                 :            :                 /* yield cpu to avoid soft lockup */
    2384                 :          0 :                 cond_resched();
    2385                 :            : 
    2386                 :          0 :                 ret = alloc_pool_huge_page(h, nodes_allowed,
    2387                 :            :                                                 node_alloc_noretry);
    2388                 :          0 :                 spin_lock(&hugetlb_lock);
    2389         [ #  # ]:          0 :                 if (!ret)
    2390                 :          0 :                         goto out;
    2391                 :            : 
    2392                 :            :                 /* Bail for signals. Probably ctrl-c from user */
    2393         [ #  # ]:          0 :                 if (signal_pending(current))
    2394                 :          0 :                         goto out;
    2395                 :            :         }
    2396                 :            : 
    2397                 :            :         /*
    2398                 :            :          * Decrease the pool size
    2399                 :            :          * First return free pages to the buddy allocator (being careful
    2400                 :            :          * to keep enough around to satisfy reservations).  Then place
    2401                 :            :          * pages into surplus state as needed so the pool will shrink
    2402                 :            :          * to the desired size as pages become free.
    2403                 :            :          *
    2404                 :            :          * By placing pages into the surplus state independent of the
    2405                 :            :          * overcommit value, we are allowing the surplus pool size to
    2406                 :            :          * exceed overcommit. There are few sane options here. Since
    2407                 :            :          * alloc_surplus_huge_page() is checking the global counter,
    2408                 :            :          * though, we'll note that we're not allowed to exceed surplus
    2409                 :            :          * and won't grow the pool anywhere else. Not until one of the
    2410                 :            :          * sysctls are changed, or the surplus pages go out of use.
    2411                 :            :          */
    2412                 :          0 :         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
    2413                 :          0 :         min_count = max(count, min_count);
    2414                 :          0 :         try_to_free_low(h, min_count, nodes_allowed);
    2415         [ #  # ]:          0 :         while (min_count < persistent_huge_pages(h)) {
    2416         [ #  # ]:          0 :                 if (!free_pool_huge_page(h, nodes_allowed, 0))
    2417                 :            :                         break;
    2418                 :          0 :                 cond_resched_lock(&hugetlb_lock);
    2419                 :            :         }
    2420         [ #  # ]:          0 :         while (count < persistent_huge_pages(h)) {
    2421         [ #  # ]:          0 :                 if (!adjust_pool_surplus(h, nodes_allowed, 1))
    2422                 :            :                         break;
    2423                 :            :         }
    2424                 :          0 : out:
    2425                 :          0 :         h->max_huge_pages = persistent_huge_pages(h);
    2426                 :          0 :         spin_unlock(&hugetlb_lock);
    2427                 :            : 
    2428                 :          0 :         NODEMASK_FREE(node_alloc_noretry);
    2429                 :            : 
    2430                 :          0 :         return 0;
    2431                 :            : }
    2432                 :            : 
    2433                 :            : #define HSTATE_ATTR_RO(_name) \
    2434                 :            :         static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
    2435                 :            : 
    2436                 :            : #define HSTATE_ATTR(_name) \
    2437                 :            :         static struct kobj_attribute _name##_attr = \
    2438                 :            :                 __ATTR(_name, 0644, _name##_show, _name##_store)
    2439                 :            : 
    2440                 :            : static struct kobject *hugepages_kobj;
    2441                 :            : static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
    2442                 :            : 
    2443                 :            : static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
    2444                 :            : 
    2445                 :          0 : static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
    2446                 :            : {
    2447                 :          0 :         int i;
    2448                 :            : 
    2449   [ #  #  #  #  :          0 :         for (i = 0; i < HUGE_MAX_HSTATE; i++)
          #  #  #  #  #  
                #  #  # ]
    2450   [ #  #  #  #  :          0 :                 if (hstate_kobjs[i] == kobj) {
          #  #  #  #  #  
                #  #  # ]
    2451                 :          0 :                         if (nidp)
    2452                 :          0 :                                 *nidp = NUMA_NO_NODE;
    2453                 :          0 :                         return &hstates[i];
    2454                 :            :                 }
    2455                 :            : 
    2456                 :          0 :         return kobj_to_node_hstate(kobj, nidp);
    2457                 :            : }
    2458                 :            : 
    2459                 :            : static ssize_t nr_hugepages_show_common(struct kobject *kobj,
    2460                 :            :                                         struct kobj_attribute *attr, char *buf)
    2461                 :            : {
    2462                 :            :         struct hstate *h;
    2463                 :            :         unsigned long nr_huge_pages;
    2464                 :            :         int nid;
    2465                 :            : 
    2466                 :            :         h = kobj_to_hstate(kobj, &nid);
    2467                 :            :         if (nid == NUMA_NO_NODE)
    2468                 :            :                 nr_huge_pages = h->nr_huge_pages;
    2469                 :            :         else
    2470                 :            :                 nr_huge_pages = h->nr_huge_pages_node[nid];
    2471                 :            : 
    2472                 :            :         return sprintf(buf, "%lu\n", nr_huge_pages);
    2473                 :            : }
    2474                 :            : 
    2475                 :          0 : static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
    2476                 :            :                                            struct hstate *h, int nid,
    2477                 :            :                                            unsigned long count, size_t len)
    2478                 :            : {
    2479                 :          0 :         int err;
    2480                 :          0 :         nodemask_t nodes_allowed, *n_mask;
    2481                 :            : 
    2482         [ #  # ]:          0 :         if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
    2483                 :            :                 return -EINVAL;
    2484                 :            : 
    2485         [ #  # ]:          0 :         if (nid == NUMA_NO_NODE) {
    2486                 :            :                 /*
    2487                 :            :                  * global hstate attribute
    2488                 :            :                  */
    2489   [ #  #  #  # ]:          0 :                 if (!(obey_mempolicy &&
    2490                 :          0 :                                 init_nodemask_of_mempolicy(&nodes_allowed)))
    2491                 :            :                         n_mask = &node_states[N_MEMORY];
    2492                 :            :                 else
    2493                 :            :                         n_mask = &nodes_allowed;
    2494                 :            :         } else {
    2495                 :            :                 /*
    2496                 :            :                  * Node specific request.  count adjustment happens in
    2497                 :            :                  * set_max_huge_pages() after acquiring hugetlb_lock.
    2498                 :            :                  */
    2499                 :          0 :                 init_nodemask_of_node(&nodes_allowed, nid);
    2500                 :          0 :                 n_mask = &nodes_allowed;
    2501                 :            :         }
    2502                 :            : 
    2503                 :          0 :         err = set_max_huge_pages(h, count, nid, n_mask);
    2504                 :            : 
    2505         [ #  # ]:          0 :         return err ? err : len;
    2506                 :            : }
    2507                 :            : 
    2508                 :          0 : static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
    2509                 :            :                                          struct kobject *kobj, const char *buf,
    2510                 :            :                                          size_t len)
    2511                 :            : {
    2512                 :          0 :         struct hstate *h;
    2513                 :          0 :         unsigned long count;
    2514                 :          0 :         int nid;
    2515                 :          0 :         int err;
    2516                 :            : 
    2517                 :          0 :         err = kstrtoul(buf, 10, &count);
    2518         [ #  # ]:          0 :         if (err)
    2519                 :          0 :                 return err;
    2520                 :            : 
    2521                 :            :         h = kobj_to_hstate(kobj, &nid);
    2522                 :          0 :         return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
    2523                 :            : }
    2524                 :            : 
    2525                 :          0 : static ssize_t nr_hugepages_show(struct kobject *kobj,
    2526                 :            :                                        struct kobj_attribute *attr, char *buf)
    2527                 :            : {
    2528                 :          0 :         return nr_hugepages_show_common(kobj, attr, buf);
    2529                 :            : }
    2530                 :            : 
    2531                 :          0 : static ssize_t nr_hugepages_store(struct kobject *kobj,
    2532                 :            :                struct kobj_attribute *attr, const char *buf, size_t len)
    2533                 :            : {
    2534                 :          0 :         return nr_hugepages_store_common(false, kobj, buf, len);
    2535                 :            : }
    2536                 :            : HSTATE_ATTR(nr_hugepages);
    2537                 :            : 
    2538                 :            : #ifdef CONFIG_NUMA
    2539                 :            : 
    2540                 :            : /*
    2541                 :            :  * hstate attribute for optionally mempolicy-based constraint on persistent
    2542                 :            :  * huge page alloc/free.
    2543                 :            :  */
    2544                 :          0 : static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
    2545                 :            :                                        struct kobj_attribute *attr, char *buf)
    2546                 :            : {
    2547                 :          0 :         return nr_hugepages_show_common(kobj, attr, buf);
    2548                 :            : }
    2549                 :            : 
    2550                 :          0 : static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
    2551                 :            :                struct kobj_attribute *attr, const char *buf, size_t len)
    2552                 :            : {
    2553                 :          0 :         return nr_hugepages_store_common(true, kobj, buf, len);
    2554                 :            : }
    2555                 :            : HSTATE_ATTR(nr_hugepages_mempolicy);
    2556                 :            : #endif
    2557                 :            : 
    2558                 :            : 
    2559                 :          0 : static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
    2560                 :            :                                         struct kobj_attribute *attr, char *buf)
    2561                 :            : {
    2562                 :          0 :         struct hstate *h = kobj_to_hstate(kobj, NULL);
    2563                 :          0 :         return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
    2564                 :            : }
    2565                 :            : 
    2566                 :          0 : static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
    2567                 :            :                 struct kobj_attribute *attr, const char *buf, size_t count)
    2568                 :            : {
    2569                 :          0 :         int err;
    2570                 :          0 :         unsigned long input;
    2571                 :          0 :         struct hstate *h = kobj_to_hstate(kobj, NULL);
    2572                 :            : 
    2573         [ #  # ]:          0 :         if (hstate_is_gigantic(h))
    2574                 :            :                 return -EINVAL;
    2575                 :            : 
    2576                 :          0 :         err = kstrtoul(buf, 10, &input);
    2577         [ #  # ]:          0 :         if (err)
    2578                 :          0 :                 return err;
    2579                 :            : 
    2580                 :          0 :         spin_lock(&hugetlb_lock);
    2581                 :          0 :         h->nr_overcommit_huge_pages = input;
    2582                 :          0 :         spin_unlock(&hugetlb_lock);
    2583                 :            : 
    2584                 :          0 :         return count;
    2585                 :            : }
    2586                 :            : HSTATE_ATTR(nr_overcommit_hugepages);
    2587                 :            : 
    2588                 :          0 : static ssize_t free_hugepages_show(struct kobject *kobj,
    2589                 :            :                                         struct kobj_attribute *attr, char *buf)
    2590                 :            : {
    2591                 :          0 :         struct hstate *h;
    2592                 :          0 :         unsigned long free_huge_pages;
    2593                 :          0 :         int nid;
    2594                 :            : 
    2595                 :          0 :         h = kobj_to_hstate(kobj, &nid);
    2596         [ #  # ]:          0 :         if (nid == NUMA_NO_NODE)
    2597                 :          0 :                 free_huge_pages = h->free_huge_pages;
    2598                 :            :         else
    2599                 :          0 :                 free_huge_pages = h->free_huge_pages_node[nid];
    2600                 :            : 
    2601                 :          0 :         return sprintf(buf, "%lu\n", free_huge_pages);
    2602                 :            : }
    2603                 :            : HSTATE_ATTR_RO(free_hugepages);
    2604                 :            : 
    2605                 :          0 : static ssize_t resv_hugepages_show(struct kobject *kobj,
    2606                 :            :                                         struct kobj_attribute *attr, char *buf)
    2607                 :            : {
    2608                 :          0 :         struct hstate *h = kobj_to_hstate(kobj, NULL);
    2609                 :          0 :         return sprintf(buf, "%lu\n", h->resv_huge_pages);
    2610                 :            : }
    2611                 :            : HSTATE_ATTR_RO(resv_hugepages);
    2612                 :            : 
    2613                 :          0 : static ssize_t surplus_hugepages_show(struct kobject *kobj,
    2614                 :            :                                         struct kobj_attribute *attr, char *buf)
    2615                 :            : {
    2616                 :          0 :         struct hstate *h;
    2617                 :          0 :         unsigned long surplus_huge_pages;
    2618                 :          0 :         int nid;
    2619                 :            : 
    2620                 :          0 :         h = kobj_to_hstate(kobj, &nid);
    2621         [ #  # ]:          0 :         if (nid == NUMA_NO_NODE)
    2622                 :          0 :                 surplus_huge_pages = h->surplus_huge_pages;
    2623                 :            :         else
    2624                 :          0 :                 surplus_huge_pages = h->surplus_huge_pages_node[nid];
    2625                 :            : 
    2626                 :          0 :         return sprintf(buf, "%lu\n", surplus_huge_pages);
    2627                 :            : }
    2628                 :            : HSTATE_ATTR_RO(surplus_hugepages);
    2629                 :            : 
    2630                 :            : static struct attribute *hstate_attrs[] = {
    2631                 :            :         &nr_hugepages_attr.attr,
    2632                 :            :         &nr_overcommit_hugepages_attr.attr,
    2633                 :            :         &free_hugepages_attr.attr,
    2634                 :            :         &resv_hugepages_attr.attr,
    2635                 :            :         &surplus_hugepages_attr.attr,
    2636                 :            : #ifdef CONFIG_NUMA
    2637                 :            :         &nr_hugepages_mempolicy_attr.attr,
    2638                 :            : #endif
    2639                 :            :         NULL,
    2640                 :            : };
    2641                 :            : 
    2642                 :            : static const struct attribute_group hstate_attr_group = {
    2643                 :            :         .attrs = hstate_attrs,
    2644                 :            : };
    2645                 :            : 
    2646                 :        156 : static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
    2647                 :            :                                     struct kobject **hstate_kobjs,
    2648                 :            :                                     const struct attribute_group *hstate_attr_group)
    2649                 :            : {
    2650                 :        156 :         int retval;
    2651                 :        156 :         int hi = hstate_index(h);
    2652                 :            : 
    2653                 :        156 :         hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
    2654         [ +  - ]:        156 :         if (!hstate_kobjs[hi])
    2655                 :            :                 return -ENOMEM;
    2656                 :            : 
    2657                 :        156 :         retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
    2658         [ -  + ]:        156 :         if (retval)
    2659                 :          0 :                 kobject_put(hstate_kobjs[hi]);
    2660                 :            : 
    2661                 :            :         return retval;
    2662                 :            : }
    2663                 :            : 
    2664                 :         78 : static void __init hugetlb_sysfs_init(void)
    2665                 :            : {
    2666                 :         78 :         struct hstate *h;
    2667                 :         78 :         int err;
    2668                 :            : 
    2669                 :         78 :         hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
    2670         [ +  - ]:         78 :         if (!hugepages_kobj)
    2671                 :            :                 return;
    2672                 :            : 
    2673         [ +  + ]:        156 :         for_each_hstate(h) {
    2674                 :         78 :                 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
    2675                 :            :                                          hstate_kobjs, &hstate_attr_group);
    2676         [ -  + ]:         78 :                 if (err)
    2677                 :          0 :                         pr_err("Hugetlb: Unable to add hstate %s", h->name);
    2678                 :            :         }
    2679                 :            : }
    2680                 :            : 
    2681                 :            : #ifdef CONFIG_NUMA
    2682                 :            : 
    2683                 :            : /*
    2684                 :            :  * node_hstate/s - associate per node hstate attributes, via their kobjects,
    2685                 :            :  * with node devices in node_devices[] using a parallel array.  The array
    2686                 :            :  * index of a node device or _hstate == node id.
    2687                 :            :  * This is here to avoid any static dependency of the node device driver, in
    2688                 :            :  * the base kernel, on the hugetlb module.
    2689                 :            :  */
    2690                 :            : struct node_hstate {
    2691                 :            :         struct kobject          *hugepages_kobj;
    2692                 :            :         struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
    2693                 :            : };
    2694                 :            : static struct node_hstate node_hstates[MAX_NUMNODES];
    2695                 :            : 
    2696                 :            : /*
    2697                 :            :  * A subset of global hstate attributes for node devices
    2698                 :            :  */
    2699                 :            : static struct attribute *per_node_hstate_attrs[] = {
    2700                 :            :         &nr_hugepages_attr.attr,
    2701                 :            :         &free_hugepages_attr.attr,
    2702                 :            :         &surplus_hugepages_attr.attr,
    2703                 :            :         NULL,
    2704                 :            : };
    2705                 :            : 
    2706                 :            : static const struct attribute_group per_node_hstate_attr_group = {
    2707                 :            :         .attrs = per_node_hstate_attrs,
    2708                 :            : };
    2709                 :            : 
    2710                 :            : /*
    2711                 :            :  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
    2712                 :            :  * Returns node id via non-NULL nidp.
    2713                 :            :  */
    2714                 :          0 : static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
    2715                 :            : {
    2716                 :          0 :         int nid;
    2717                 :            : 
    2718         [ #  # ]:          0 :         for (nid = 0; nid < nr_node_ids; nid++) {
    2719                 :            :                 struct node_hstate *nhs = &node_hstates[nid];
    2720                 :            :                 int i;
    2721         [ #  # ]:          0 :                 for (i = 0; i < HUGE_MAX_HSTATE; i++)
    2722         [ #  # ]:          0 :                         if (nhs->hstate_kobjs[i] == kobj) {
    2723         [ #  # ]:          0 :                                 if (nidp)
    2724                 :          0 :                                         *nidp = nid;
    2725                 :          0 :                                 return &hstates[i];
    2726                 :            :                         }
    2727                 :            :         }
    2728                 :            : 
    2729                 :          0 :         BUG();
    2730                 :            :         return NULL;
    2731                 :            : }
    2732                 :            : 
    2733                 :            : /*
    2734                 :            :  * Unregister hstate attributes from a single node device.
    2735                 :            :  * No-op if no hstate attributes attached.
    2736                 :            :  */
    2737                 :          0 : static void hugetlb_unregister_node(struct node *node)
    2738                 :            : {
    2739                 :          0 :         struct hstate *h;
    2740                 :          0 :         struct node_hstate *nhs = &node_hstates[node->dev.id];
    2741                 :            : 
    2742         [ #  # ]:          0 :         if (!nhs->hugepages_kobj)
    2743                 :            :                 return;         /* no hstate attributes */
    2744                 :            : 
    2745         [ #  # ]:          0 :         for_each_hstate(h) {
    2746         [ #  # ]:          0 :                 int idx = hstate_index(h);
    2747         [ #  # ]:          0 :                 if (nhs->hstate_kobjs[idx]) {
    2748                 :          0 :                         kobject_put(nhs->hstate_kobjs[idx]);
    2749                 :          0 :                         nhs->hstate_kobjs[idx] = NULL;
    2750                 :            :                 }
    2751                 :            :         }
    2752                 :            : 
    2753                 :          0 :         kobject_put(nhs->hugepages_kobj);
    2754                 :          0 :         nhs->hugepages_kobj = NULL;
    2755                 :            : }
    2756                 :            : 
    2757                 :            : 
    2758                 :            : /*
    2759                 :            :  * Register hstate attributes for a single node device.
    2760                 :            :  * No-op if attributes already registered.
    2761                 :            :  */
    2762                 :         78 : static void hugetlb_register_node(struct node *node)
    2763                 :            : {
    2764                 :         78 :         struct hstate *h;
    2765                 :         78 :         struct node_hstate *nhs = &node_hstates[node->dev.id];
    2766                 :         78 :         int err;
    2767                 :            : 
    2768         [ +  - ]:         78 :         if (nhs->hugepages_kobj)
    2769                 :            :                 return;         /* already allocated */
    2770                 :            : 
    2771                 :         78 :         nhs->hugepages_kobj = kobject_create_and_add("hugepages",
    2772                 :            :                                                         &node->dev.kobj);
    2773         [ +  - ]:         78 :         if (!nhs->hugepages_kobj)
    2774                 :            :                 return;
    2775                 :            : 
    2776         [ +  + ]:        156 :         for_each_hstate(h) {
    2777                 :         78 :                 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
    2778                 :         78 :                                                 nhs->hstate_kobjs,
    2779                 :            :                                                 &per_node_hstate_attr_group);
    2780         [ -  + ]:         78 :                 if (err) {
    2781                 :          0 :                         pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
    2782                 :            :                                 h->name, node->dev.id);
    2783                 :          0 :                         hugetlb_unregister_node(node);
    2784                 :          0 :                         break;
    2785                 :            :                 }
    2786                 :            :         }
    2787                 :            : }
    2788                 :            : 
    2789                 :            : /*
    2790                 :            :  * hugetlb init time:  register hstate attributes for all registered node
    2791                 :            :  * devices of nodes that have memory.  All on-line nodes should have
    2792                 :            :  * registered their associated device by this time.
    2793                 :            :  */
    2794                 :         78 : static void __init hugetlb_register_all_nodes(void)
    2795                 :            : {
    2796                 :         78 :         int nid;
    2797                 :            : 
    2798         [ +  + ]:        312 :         for_each_node_state(nid, N_MEMORY) {
    2799                 :         78 :                 struct node *node = node_devices[nid];
    2800         [ +  - ]:         78 :                 if (node->dev.id == nid)
    2801                 :         78 :                         hugetlb_register_node(node);
    2802                 :            :         }
    2803                 :            : 
    2804                 :            :         /*
    2805                 :            :          * Let the node device driver know we're here so it can
    2806                 :            :          * [un]register hstate attributes on node hotplug.
    2807                 :            :          */
    2808                 :         78 :         register_hugetlbfs_with_node(hugetlb_register_node,
    2809                 :            :                                      hugetlb_unregister_node);
    2810                 :         78 : }
    2811                 :            : #else   /* !CONFIG_NUMA */
    2812                 :            : 
    2813                 :            : static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
    2814                 :            : {
    2815                 :            :         BUG();
    2816                 :            :         if (nidp)
    2817                 :            :                 *nidp = -1;
    2818                 :            :         return NULL;
    2819                 :            : }
    2820                 :            : 
    2821                 :            : static void hugetlb_register_all_nodes(void) { }
    2822                 :            : 
    2823                 :            : #endif
    2824                 :            : 
    2825                 :         78 : static int __init hugetlb_init(void)
    2826                 :            : {
    2827                 :         78 :         int i;
    2828                 :            : 
    2829         [ +  - ]:         78 :         if (!hugepages_supported())
    2830                 :            :                 return 0;
    2831                 :            : 
    2832         [ +  - ]:        156 :         if (!size_to_hstate(default_hstate_size)) {
    2833         [ -  + ]:         78 :                 if (default_hstate_size != 0) {
    2834                 :          0 :                         pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
    2835                 :            :                                default_hstate_size, HPAGE_SIZE);
    2836                 :            :                 }
    2837                 :            : 
    2838                 :         78 :                 default_hstate_size = HPAGE_SIZE;
    2839         [ +  - ]:        156 :                 if (!size_to_hstate(default_hstate_size))
    2840                 :         78 :                         hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
    2841                 :            :         }
    2842                 :         78 :         default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
    2843         [ -  + ]:         78 :         if (default_hstate_max_huge_pages) {
    2844         [ #  # ]:          0 :                 if (!default_hstate.max_huge_pages)
    2845                 :          0 :                         default_hstate.max_huge_pages = default_hstate_max_huge_pages;
    2846                 :            :         }
    2847                 :            : 
    2848                 :         78 :         hugetlb_init_hstates();
    2849                 :         78 :         gather_bootmem_prealloc();
    2850                 :         78 :         report_hugepages();
    2851                 :            : 
    2852                 :         78 :         hugetlb_sysfs_init();
    2853                 :         78 :         hugetlb_register_all_nodes();
    2854                 :         78 :         hugetlb_cgroup_file_init();
    2855                 :            : 
    2856                 :            : #ifdef CONFIG_SMP
    2857                 :         78 :         num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
    2858                 :            : #else
    2859                 :            :         num_fault_mutexes = 1;
    2860                 :            : #endif
    2861                 :        156 :         hugetlb_fault_mutex_table =
    2862                 :         78 :                 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
    2863                 :            :                               GFP_KERNEL);
    2864         [ -  + ]:         78 :         BUG_ON(!hugetlb_fault_mutex_table);
    2865                 :            : 
    2866         [ +  + ]:        702 :         for (i = 0; i < num_fault_mutexes; i++)
    2867                 :        624 :                 mutex_init(&hugetlb_fault_mutex_table[i]);
    2868                 :            :         return 0;
    2869                 :            : }
    2870                 :            : subsys_initcall(hugetlb_init);
    2871                 :            : 
    2872                 :            : /* Should be called on processing a hugepagesz=... option */
    2873                 :          0 : void __init hugetlb_bad_size(void)
    2874                 :            : {
    2875                 :          0 :         parsed_valid_hugepagesz = false;
    2876                 :          0 : }
    2877                 :            : 
    2878                 :         78 : void __init hugetlb_add_hstate(unsigned int order)
    2879                 :            : {
    2880                 :         78 :         struct hstate *h;
    2881                 :         78 :         unsigned long i;
    2882                 :            : 
    2883         [ -  + ]:        156 :         if (size_to_hstate(PAGE_SIZE << order)) {
    2884                 :          0 :                 pr_warn("hugepagesz= specified twice, ignoring\n");
    2885                 :          0 :                 return;
    2886                 :            :         }
    2887         [ -  + ]:         78 :         BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
    2888         [ -  + ]:         78 :         BUG_ON(order == 0);
    2889                 :         78 :         h = &hstates[hugetlb_max_hstate++];
    2890                 :         78 :         h->order = order;
    2891                 :         78 :         h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
    2892                 :         78 :         h->nr_huge_pages = 0;
    2893                 :         78 :         h->free_huge_pages = 0;
    2894         [ +  + ]:       5070 :         for (i = 0; i < MAX_NUMNODES; ++i)
    2895                 :       4992 :                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
    2896                 :         78 :         INIT_LIST_HEAD(&h->hugepage_activelist);
    2897                 :         78 :         h->next_nid_to_alloc = first_memory_node;
    2898                 :         78 :         h->next_nid_to_free = first_memory_node;
    2899                 :         78 :         snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
    2900                 :         78 :                                         huge_page_size(h)/1024);
    2901                 :            : 
    2902                 :         78 :         parsed_hstate = h;
    2903                 :            : }
    2904                 :            : 
    2905                 :          0 : static int __init hugetlb_nrpages_setup(char *s)
    2906                 :            : {
    2907                 :          0 :         unsigned long *mhp;
    2908                 :          0 :         static unsigned long *last_mhp;
    2909                 :            : 
    2910         [ #  # ]:          0 :         if (!parsed_valid_hugepagesz) {
    2911                 :          0 :                 pr_warn("hugepages = %s preceded by "
    2912                 :            :                         "an unsupported hugepagesz, ignoring\n", s);
    2913                 :          0 :                 parsed_valid_hugepagesz = true;
    2914                 :          0 :                 return 1;
    2915                 :            :         }
    2916                 :            :         /*
    2917                 :            :          * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
    2918                 :            :          * so this hugepages= parameter goes to the "default hstate".
    2919                 :            :          */
    2920         [ #  # ]:          0 :         else if (!hugetlb_max_hstate)
    2921                 :            :                 mhp = &default_hstate_max_huge_pages;
    2922                 :            :         else
    2923                 :          0 :                 mhp = &parsed_hstate->max_huge_pages;
    2924                 :            : 
    2925         [ #  # ]:          0 :         if (mhp == last_mhp) {
    2926                 :          0 :                 pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
    2927                 :          0 :                 return 1;
    2928                 :            :         }
    2929                 :            : 
    2930         [ #  # ]:          0 :         if (sscanf(s, "%lu", mhp) <= 0)
    2931                 :          0 :                 *mhp = 0;
    2932                 :            : 
    2933                 :            :         /*
    2934                 :            :          * Global state is always initialized later in hugetlb_init.
    2935                 :            :          * But we need to allocate >= MAX_ORDER hstates here early to still
    2936                 :            :          * use the bootmem allocator.
    2937                 :            :          */
    2938   [ #  #  #  # ]:          0 :         if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
    2939                 :          0 :                 hugetlb_hstate_alloc_pages(parsed_hstate);
    2940                 :            : 
    2941                 :          0 :         last_mhp = mhp;
    2942                 :            : 
    2943                 :          0 :         return 1;
    2944                 :            : }
    2945                 :            : __setup("hugepages=", hugetlb_nrpages_setup);
    2946                 :            : 
    2947                 :          0 : static int __init hugetlb_default_setup(char *s)
    2948                 :            : {
    2949                 :          0 :         default_hstate_size = memparse(s, &s);
    2950                 :          0 :         return 1;
    2951                 :            : }
    2952                 :            : __setup("default_hugepagesz=", hugetlb_default_setup);
    2953                 :            : 
    2954                 :          0 : static unsigned int cpuset_mems_nr(unsigned int *array)
    2955                 :            : {
    2956                 :          0 :         int node;
    2957                 :          0 :         unsigned int nr = 0;
    2958                 :            : 
    2959         [ #  # ]:          0 :         for_each_node_mask(node, cpuset_current_mems_allowed)
    2960                 :          0 :                 nr += array[node];
    2961                 :            : 
    2962                 :          0 :         return nr;
    2963                 :            : }
    2964                 :            : 
    2965                 :            : #ifdef CONFIG_SYSCTL
    2966                 :          0 : static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
    2967                 :            :                          struct ctl_table *table, int write,
    2968                 :            :                          void __user *buffer, size_t *length, loff_t *ppos)
    2969                 :            : {
    2970                 :          0 :         struct hstate *h = &default_hstate;
    2971                 :          0 :         unsigned long tmp = h->max_huge_pages;
    2972                 :          0 :         int ret;
    2973                 :            : 
    2974         [ #  # ]:          0 :         if (!hugepages_supported())
    2975                 :            :                 return -EOPNOTSUPP;
    2976                 :            : 
    2977                 :          0 :         table->data = &tmp;
    2978                 :          0 :         table->maxlen = sizeof(unsigned long);
    2979                 :          0 :         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
    2980         [ #  # ]:          0 :         if (ret)
    2981                 :          0 :                 goto out;
    2982                 :            : 
    2983         [ #  # ]:          0 :         if (write)
    2984                 :          0 :                 ret = __nr_hugepages_store_common(obey_mempolicy, h,
    2985                 :            :                                                   NUMA_NO_NODE, tmp, *length);
    2986                 :          0 : out:
    2987                 :            :         return ret;
    2988                 :            : }
    2989                 :            : 
    2990                 :          0 : int hugetlb_sysctl_handler(struct ctl_table *table, int write,
    2991                 :            :                           void __user *buffer, size_t *length, loff_t *ppos)
    2992                 :            : {
    2993                 :            : 
    2994                 :          0 :         return hugetlb_sysctl_handler_common(false, table, write,
    2995                 :            :                                                         buffer, length, ppos);
    2996                 :            : }
    2997                 :            : 
    2998                 :            : #ifdef CONFIG_NUMA
    2999                 :          0 : int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
    3000                 :            :                           void __user *buffer, size_t *length, loff_t *ppos)
    3001                 :            : {
    3002                 :          0 :         return hugetlb_sysctl_handler_common(true, table, write,
    3003                 :            :                                                         buffer, length, ppos);
    3004                 :            : }
    3005                 :            : #endif /* CONFIG_NUMA */
    3006                 :            : 
    3007                 :          0 : int hugetlb_overcommit_handler(struct ctl_table *table, int write,
    3008                 :            :                         void __user *buffer,
    3009                 :            :                         size_t *length, loff_t *ppos)
    3010                 :            : {
    3011                 :          0 :         struct hstate *h = &default_hstate;
    3012                 :          0 :         unsigned long tmp;
    3013                 :          0 :         int ret;
    3014                 :            : 
    3015         [ #  # ]:          0 :         if (!hugepages_supported())
    3016                 :            :                 return -EOPNOTSUPP;
    3017                 :            : 
    3018                 :          0 :         tmp = h->nr_overcommit_huge_pages;
    3019                 :            : 
    3020   [ #  #  #  # ]:          0 :         if (write && hstate_is_gigantic(h))
    3021                 :            :                 return -EINVAL;
    3022                 :            : 
    3023                 :          0 :         table->data = &tmp;
    3024                 :          0 :         table->maxlen = sizeof(unsigned long);
    3025                 :          0 :         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
    3026         [ #  # ]:          0 :         if (ret)
    3027                 :          0 :                 goto out;
    3028                 :            : 
    3029         [ #  # ]:          0 :         if (write) {
    3030                 :          0 :                 spin_lock(&hugetlb_lock);
    3031                 :          0 :                 h->nr_overcommit_huge_pages = tmp;
    3032                 :          0 :                 spin_unlock(&hugetlb_lock);
    3033                 :            :         }
    3034                 :          0 : out:
    3035                 :            :         return ret;
    3036                 :            : }
    3037                 :            : 
    3038                 :            : #endif /* CONFIG_SYSCTL */
    3039                 :            : 
    3040                 :         78 : void hugetlb_report_meminfo(struct seq_file *m)
    3041                 :            : {
    3042                 :         78 :         struct hstate *h;
    3043                 :         78 :         unsigned long total = 0;
    3044                 :            : 
    3045         [ +  - ]:         78 :         if (!hugepages_supported())
    3046                 :            :                 return;
    3047                 :            : 
    3048         [ +  + ]:        156 :         for_each_hstate(h) {
    3049                 :         78 :                 unsigned long count = h->nr_huge_pages;
    3050                 :            : 
    3051         [ +  - ]:         78 :                 total += (PAGE_SIZE << huge_page_order(h)) * count;
    3052                 :            : 
    3053         [ +  - ]:         78 :                 if (h == &default_hstate)
    3054                 :         78 :                         seq_printf(m,
    3055                 :            :                                    "HugePages_Total:   %5lu\n"
    3056                 :            :                                    "HugePages_Free:    %5lu\n"
    3057                 :            :                                    "HugePages_Rsvd:    %5lu\n"
    3058                 :            :                                    "HugePages_Surp:    %5lu\n"
    3059                 :            :                                    "Hugepagesize:   %8lu kB\n",
    3060                 :            :                                    count,
    3061                 :            :                                    h->free_huge_pages,
    3062                 :            :                                    h->resv_huge_pages,
    3063                 :            :                                    h->surplus_huge_pages,
    3064                 :            :                                    (PAGE_SIZE << huge_page_order(h)) / 1024);
    3065                 :            :         }
    3066                 :            : 
    3067                 :         78 :         seq_printf(m, "Hugetlb:        %8lu kB\n", total / 1024);
    3068                 :            : }
    3069                 :            : 
    3070                 :          0 : int hugetlb_report_node_meminfo(int nid, char *buf)
    3071                 :            : {
    3072                 :          0 :         struct hstate *h = &default_hstate;
    3073         [ #  # ]:          0 :         if (!hugepages_supported())
    3074                 :            :                 return 0;
    3075                 :          0 :         return sprintf(buf,
    3076                 :            :                 "Node %d HugePages_Total: %5u\n"
    3077                 :            :                 "Node %d HugePages_Free:  %5u\n"
    3078                 :            :                 "Node %d HugePages_Surp:  %5u\n",
    3079                 :            :                 nid, h->nr_huge_pages_node[nid],
    3080                 :            :                 nid, h->free_huge_pages_node[nid],
    3081                 :            :                 nid, h->surplus_huge_pages_node[nid]);
    3082                 :            : }
    3083                 :            : 
    3084                 :          0 : void hugetlb_show_meminfo(void)
    3085                 :            : {
    3086                 :          0 :         struct hstate *h;
    3087                 :          0 :         int nid;
    3088                 :            : 
    3089         [ #  # ]:          0 :         if (!hugepages_supported())
    3090                 :            :                 return;
    3091                 :            : 
    3092         [ #  # ]:          0 :         for_each_node_state(nid, N_MEMORY)
    3093         [ #  # ]:          0 :                 for_each_hstate(h)
    3094                 :          0 :                         pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
    3095                 :            :                                 nid,
    3096                 :            :                                 h->nr_huge_pages_node[nid],
    3097                 :            :                                 h->free_huge_pages_node[nid],
    3098                 :            :                                 h->surplus_huge_pages_node[nid],
    3099                 :            :                                 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
    3100                 :            : }
    3101                 :            : 
    3102                 :       1763 : void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
    3103                 :            : {
    3104                 :       1763 :         seq_printf(m, "HugetlbPages:\t%8lu kB\n",
    3105                 :       1763 :                    atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
    3106                 :       1763 : }
    3107                 :            : 
    3108                 :            : /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
    3109                 :         78 : unsigned long hugetlb_total_pages(void)
    3110                 :            : {
    3111                 :         78 :         struct hstate *h;
    3112                 :         78 :         unsigned long nr_total_pages = 0;
    3113                 :            : 
    3114         [ +  + ]:        156 :         for_each_hstate(h)
    3115                 :         78 :                 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
    3116                 :         78 :         return nr_total_pages;
    3117                 :            : }
    3118                 :            : 
    3119                 :          0 : static int hugetlb_acct_memory(struct hstate *h, long delta)
    3120                 :            : {
    3121                 :          0 :         int ret = -ENOMEM;
    3122                 :            : 
    3123                 :          0 :         spin_lock(&hugetlb_lock);
    3124                 :            :         /*
    3125                 :            :          * When cpuset is configured, it breaks the strict hugetlb page
    3126                 :            :          * reservation as the accounting is done on a global variable. Such
    3127                 :            :          * reservation is completely rubbish in the presence of cpuset because
    3128                 :            :          * the reservation is not checked against page availability for the
    3129                 :            :          * current cpuset. Application can still potentially OOM'ed by kernel
    3130                 :            :          * with lack of free htlb page in cpuset that the task is in.
    3131                 :            :          * Attempt to enforce strict accounting with cpuset is almost
    3132                 :            :          * impossible (or too ugly) because cpuset is too fluid that
    3133                 :            :          * task or memory node can be dynamically moved between cpusets.
    3134                 :            :          *
    3135                 :            :          * The change of semantics for shared hugetlb mapping with cpuset is
    3136                 :            :          * undesirable. However, in order to preserve some of the semantics,
    3137                 :            :          * we fall back to check against current free page availability as
    3138                 :            :          * a best attempt and hopefully to minimize the impact of changing
    3139                 :            :          * semantics that cpuset has.
    3140                 :            :          */
    3141         [ #  # ]:          0 :         if (delta > 0) {
    3142         [ #  # ]:          0 :                 if (gather_surplus_pages(h, delta) < 0)
    3143                 :          0 :                         goto out;
    3144                 :            : 
    3145         [ #  # ]:          0 :                 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
    3146                 :          0 :                         return_unused_surplus_pages(h, delta);
    3147                 :          0 :                         goto out;
    3148                 :            :                 }
    3149                 :            :         }
    3150                 :            : 
    3151                 :          0 :         ret = 0;
    3152         [ #  # ]:          0 :         if (delta < 0)
    3153                 :          0 :                 return_unused_surplus_pages(h, (unsigned long) -delta);
    3154                 :            : 
    3155                 :          0 : out:
    3156                 :          0 :         spin_unlock(&hugetlb_lock);
    3157                 :          0 :         return ret;
    3158                 :            : }
    3159                 :            : 
    3160                 :          0 : static void hugetlb_vm_op_open(struct vm_area_struct *vma)
    3161                 :            : {
    3162         [ #  # ]:          0 :         struct resv_map *resv = vma_resv_map(vma);
    3163                 :            : 
    3164                 :            :         /*
    3165                 :            :          * This new VMA should share its siblings reservation map if present.
    3166                 :            :          * The VMA will only ever have a valid reservation map pointer where
    3167                 :            :          * it is being copied for another still existing VMA.  As that VMA
    3168                 :            :          * has a reference to the reservation map it cannot disappear until
    3169                 :            :          * after this open call completes.  It is therefore safe to take a
    3170                 :            :          * new reference here without additional locking.
    3171                 :            :          */
    3172   [ #  #  #  # ]:          0 :         if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
    3173                 :          0 :                 kref_get(&resv->refs);
    3174                 :          0 : }
    3175                 :            : 
    3176                 :          0 : static void hugetlb_vm_op_close(struct vm_area_struct *vma)
    3177                 :            : {
    3178         [ #  # ]:          0 :         struct hstate *h = hstate_vma(vma);
    3179         [ #  # ]:          0 :         struct resv_map *resv = vma_resv_map(vma);
    3180         [ #  # ]:          0 :         struct hugepage_subpool *spool = subpool_vma(vma);
    3181                 :          0 :         unsigned long reserve, start, end;
    3182                 :          0 :         long gbl_reserve;
    3183                 :            : 
    3184   [ #  #  #  # ]:          0 :         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
    3185                 :            :                 return;
    3186                 :            : 
    3187                 :          0 :         start = vma_hugecache_offset(h, vma, vma->vm_start);
    3188                 :          0 :         end = vma_hugecache_offset(h, vma, vma->vm_end);
    3189                 :            : 
    3190                 :          0 :         reserve = (end - start) - region_count(resv, start, end);
    3191                 :            : 
    3192                 :          0 :         kref_put(&resv->refs, resv_map_release);
    3193                 :            : 
    3194         [ #  # ]:          0 :         if (reserve) {
    3195                 :            :                 /*
    3196                 :            :                  * Decrement reserve counts.  The global reserve count may be
    3197                 :            :                  * adjusted if the subpool has a minimum size.
    3198                 :            :                  */
    3199                 :          0 :                 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
    3200                 :          0 :                 hugetlb_acct_memory(h, -gbl_reserve);
    3201                 :            :         }
    3202                 :            : }
    3203                 :            : 
    3204                 :          0 : static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
    3205                 :            : {
    3206         [ #  # ]:          0 :         if (addr & ~(huge_page_mask(hstate_vma(vma))))
    3207                 :          0 :                 return -EINVAL;
    3208                 :            :         return 0;
    3209                 :            : }
    3210                 :            : 
    3211                 :          0 : static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
    3212                 :            : {
    3213                 :          0 :         struct hstate *hstate = hstate_vma(vma);
    3214                 :            : 
    3215                 :          0 :         return 1UL << huge_page_shift(hstate);
    3216                 :            : }
    3217                 :            : 
    3218                 :            : /*
    3219                 :            :  * We cannot handle pagefaults against hugetlb pages at all.  They cause
    3220                 :            :  * handle_mm_fault() to try to instantiate regular-sized pages in the
    3221                 :            :  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
    3222                 :            :  * this far.
    3223                 :            :  */
    3224                 :          0 : static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
    3225                 :            : {
    3226                 :          0 :         BUG();
    3227                 :            :         return 0;
    3228                 :            : }
    3229                 :            : 
    3230                 :            : /*
    3231                 :            :  * When a new function is introduced to vm_operations_struct and added
    3232                 :            :  * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
    3233                 :            :  * This is because under System V memory model, mappings created via
    3234                 :            :  * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
    3235                 :            :  * their original vm_ops are overwritten with shm_vm_ops.
    3236                 :            :  */
    3237                 :            : const struct vm_operations_struct hugetlb_vm_ops = {
    3238                 :            :         .fault = hugetlb_vm_op_fault,
    3239                 :            :         .open = hugetlb_vm_op_open,
    3240                 :            :         .close = hugetlb_vm_op_close,
    3241                 :            :         .split = hugetlb_vm_op_split,
    3242                 :            :         .pagesize = hugetlb_vm_op_pagesize,
    3243                 :            : };
    3244                 :            : 
    3245                 :          0 : static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
    3246                 :            :                                 int writable)
    3247                 :            : {
    3248                 :          0 :         pte_t entry;
    3249                 :            : 
    3250                 :          0 :         if (writable) {
    3251                 :          0 :                 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
    3252                 :            :                                          vma->vm_page_prot)));
    3253                 :            :         } else {
    3254                 :            :                 entry = huge_pte_wrprotect(mk_huge_pte(page,
    3255                 :            :                                            vma->vm_page_prot));
    3256                 :            :         }
    3257                 :          0 :         entry = pte_mkyoung(entry);
    3258                 :          0 :         entry = pte_mkhuge(entry);
    3259                 :          0 :         entry = arch_make_huge_pte(entry, vma, page, writable);
    3260                 :            : 
    3261                 :          0 :         return entry;
    3262                 :            : }
    3263                 :            : 
    3264                 :          0 : static void set_huge_ptep_writable(struct vm_area_struct *vma,
    3265                 :            :                                    unsigned long address, pte_t *ptep)
    3266                 :            : {
    3267                 :          0 :         pte_t entry;
    3268                 :            : 
    3269                 :          0 :         entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
    3270                 :          0 :         if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
    3271                 :            :                 update_mmu_cache(vma, address, ptep);
    3272                 :            : }
    3273                 :            : 
    3274                 :          0 : bool is_hugetlb_entry_migration(pte_t pte)
    3275                 :            : {
    3276                 :          0 :         swp_entry_t swp;
    3277                 :            : 
    3278   [ #  #  #  # ]:          0 :         if (huge_pte_none(pte) || pte_present(pte))
    3279                 :            :                 return false;
    3280         [ #  # ]:          0 :         swp = pte_to_swp_entry(pte);
    3281         [ #  # ]:          0 :         if (non_swap_entry(swp) && is_migration_entry(swp))
    3282                 :            :                 return true;
    3283                 :            :         else
    3284                 :          0 :                 return false;
    3285                 :            : }
    3286                 :            : 
    3287                 :          0 : static int is_hugetlb_entry_hwpoisoned(pte_t pte)
    3288                 :            : {
    3289                 :          0 :         swp_entry_t swp;
    3290                 :            : 
    3291         [ #  # ]:          0 :         if (huge_pte_none(pte) || pte_present(pte))
    3292                 :          0 :                 return 0;
    3293                 :            :         swp = pte_to_swp_entry(pte);
    3294                 :            :         if (non_swap_entry(swp) && is_hwpoison_entry(swp))
    3295                 :            :                 return 1;
    3296                 :            :         else
    3297                 :            :                 return 0;
    3298                 :            : }
    3299                 :            : 
    3300                 :          0 : int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
    3301                 :            :                             struct vm_area_struct *vma)
    3302                 :            : {
    3303                 :          0 :         pte_t *src_pte, *dst_pte, entry, dst_entry;
    3304                 :          0 :         struct page *ptepage;
    3305                 :          0 :         unsigned long addr;
    3306                 :          0 :         int cow;
    3307         [ #  # ]:          0 :         struct hstate *h = hstate_vma(vma);
    3308         [ #  # ]:          0 :         unsigned long sz = huge_page_size(h);
    3309                 :          0 :         struct mmu_notifier_range range;
    3310                 :          0 :         int ret = 0;
    3311                 :            : 
    3312                 :          0 :         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
    3313                 :            : 
    3314         [ #  # ]:          0 :         if (cow) {
    3315                 :          0 :                 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
    3316                 :            :                                         vma->vm_start,
    3317                 :            :                                         vma->vm_end);
    3318                 :          0 :                 mmu_notifier_invalidate_range_start(&range);
    3319                 :            :         }
    3320                 :            : 
    3321         [ #  # ]:          0 :         for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
    3322                 :          0 :                 spinlock_t *src_ptl, *dst_ptl;
    3323                 :          0 :                 src_pte = huge_pte_offset(src, addr, sz);
    3324         [ #  # ]:          0 :                 if (!src_pte)
    3325                 :          0 :                         continue;
    3326                 :          0 :                 dst_pte = huge_pte_alloc(dst, addr, sz);
    3327         [ #  # ]:          0 :                 if (!dst_pte) {
    3328                 :            :                         ret = -ENOMEM;
    3329                 :            :                         break;
    3330                 :            :                 }
    3331                 :            : 
    3332                 :            :                 /*
    3333                 :            :                  * If the pagetables are shared don't copy or take references.
    3334                 :            :                  * dst_pte == src_pte is the common case of src/dest sharing.
    3335                 :            :                  *
    3336                 :            :                  * However, src could have 'unshared' and dst shares with
    3337                 :            :                  * another vma.  If dst_pte !none, this implies sharing.
    3338                 :            :                  * Check here before taking page table lock, and once again
    3339                 :            :                  * after taking the lock below.
    3340                 :            :                  */
    3341         [ #  # ]:          0 :                 dst_entry = huge_ptep_get(dst_pte);
    3342   [ #  #  #  # ]:          0 :                 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
    3343                 :          0 :                         continue;
    3344                 :            : 
    3345                 :          0 :                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
    3346         [ #  # ]:          0 :                 src_ptl = huge_pte_lockptr(h, src, src_pte);
    3347                 :          0 :                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
    3348         [ #  # ]:          0 :                 entry = huge_ptep_get(src_pte);
    3349                 :          0 :                 dst_entry = huge_ptep_get(dst_pte);
    3350   [ #  #  #  # ]:          0 :                 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
    3351                 :            :                         /*
    3352                 :            :                          * Skip if src entry none.  Also, skip in the
    3353                 :            :                          * unlikely case dst entry !none as this implies
    3354                 :            :                          * sharing with another vma.
    3355                 :            :                          */
    3356                 :            :                         ;
    3357         [ #  # ]:          0 :                 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
    3358                 :          0 :                                     is_hugetlb_entry_hwpoisoned(entry))) {
    3359         [ #  # ]:          0 :                         swp_entry_t swp_entry = pte_to_swp_entry(entry);
    3360                 :            : 
    3361   [ #  #  #  # ]:          0 :                         if (is_write_migration_entry(swp_entry) && cow) {
    3362                 :            :                                 /*
    3363                 :            :                                  * COW mappings require pages in both
    3364                 :            :                                  * parent and child to be set to read.
    3365                 :            :                                  */
    3366                 :          0 :                                 make_migration_entry_read(&swp_entry);
    3367                 :          0 :                                 entry = swp_entry_to_pte(swp_entry);
    3368                 :          0 :                                 set_huge_swap_pte_at(src, addr, src_pte,
    3369                 :            :                                                      entry, sz);
    3370                 :            :                         }
    3371                 :          0 :                         set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
    3372                 :            :                 } else {
    3373         [ #  # ]:          0 :                         if (cow) {
    3374                 :            :                                 /*
    3375                 :            :                                  * No need to notify as we are downgrading page
    3376                 :            :                                  * table protection not changing it to point
    3377                 :            :                                  * to a new page.
    3378                 :            :                                  *
    3379                 :            :                                  * See Documentation/vm/mmu_notifier.rst
    3380                 :            :                                  */
    3381                 :          0 :                                 huge_ptep_set_wrprotect(src, addr, src_pte);
    3382                 :            :                         }
    3383         [ #  # ]:          0 :                         entry = huge_ptep_get(src_pte);
    3384         [ #  # ]:          0 :                         ptepage = pte_page(entry);
    3385         [ #  # ]:          0 :                         get_page(ptepage);
    3386                 :          0 :                         page_dup_rmap(ptepage, true);
    3387                 :          0 :                         set_huge_pte_at(dst, addr, dst_pte, entry);
    3388                 :          0 :                         hugetlb_count_add(pages_per_huge_page(h), dst);
    3389                 :            :                 }
    3390                 :          0 :                 spin_unlock(src_ptl);
    3391                 :          0 :                 spin_unlock(dst_ptl);
    3392                 :            :         }
    3393                 :            : 
    3394         [ #  # ]:          0 :         if (cow)
    3395                 :          0 :                 mmu_notifier_invalidate_range_end(&range);
    3396                 :            : 
    3397                 :          0 :         return ret;
    3398                 :            : }
    3399                 :            : 
    3400                 :          0 : void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
    3401                 :            :                             unsigned long start, unsigned long end,
    3402                 :            :                             struct page *ref_page)
    3403                 :            : {
    3404                 :          0 :         struct mm_struct *mm = vma->vm_mm;
    3405                 :          0 :         unsigned long address;
    3406                 :          0 :         pte_t *ptep;
    3407                 :          0 :         pte_t pte;
    3408                 :          0 :         spinlock_t *ptl;
    3409                 :          0 :         struct page *page;
    3410         [ #  # ]:          0 :         struct hstate *h = hstate_vma(vma);
    3411         [ #  # ]:          0 :         unsigned long sz = huge_page_size(h);
    3412                 :          0 :         struct mmu_notifier_range range;
    3413                 :            : 
    3414         [ #  # ]:          0 :         WARN_ON(!is_vm_hugetlb_page(vma));
    3415         [ #  # ]:          0 :         BUG_ON(start & ~huge_page_mask(h));
    3416         [ #  # ]:          0 :         BUG_ON(end & ~huge_page_mask(h));
    3417                 :            : 
    3418                 :            :         /*
    3419                 :            :          * This is a hugetlb vma, all the pte entries should point
    3420                 :            :          * to huge page.
    3421                 :            :          */
    3422                 :          0 :         tlb_change_page_size(tlb, sz);
    3423                 :          0 :         tlb_start_vma(tlb, vma);
    3424                 :            : 
    3425                 :            :         /*
    3426                 :            :          * If sharing possible, alert mmu notifiers of worst case.
    3427                 :            :          */
    3428                 :          0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
    3429                 :            :                                 end);
    3430                 :          0 :         adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
    3431                 :          0 :         mmu_notifier_invalidate_range_start(&range);
    3432                 :          0 :         address = start;
    3433         [ #  # ]:          0 :         for (; address < end; address += sz) {
    3434                 :          0 :                 ptep = huge_pte_offset(mm, address, sz);
    3435         [ #  # ]:          0 :                 if (!ptep)
    3436                 :          0 :                         continue;
    3437                 :            : 
    3438                 :          0 :                 ptl = huge_pte_lock(h, mm, ptep);
    3439         [ #  # ]:          0 :                 if (huge_pmd_unshare(mm, &address, ptep)) {
    3440                 :          0 :                         spin_unlock(ptl);
    3441                 :            :                         /*
    3442                 :            :                          * We just unmapped a page of PMDs by clearing a PUD.
    3443                 :            :                          * The caller's TLB flush range should cover this area.
    3444                 :            :                          */
    3445                 :          0 :                         continue;
    3446                 :            :                 }
    3447                 :            : 
    3448         [ #  # ]:          0 :                 pte = huge_ptep_get(ptep);
    3449         [ #  # ]:          0 :                 if (huge_pte_none(pte)) {
    3450                 :          0 :                         spin_unlock(ptl);
    3451                 :          0 :                         continue;
    3452                 :            :                 }
    3453                 :            : 
    3454                 :            :                 /*
    3455                 :            :                  * Migrating hugepage or HWPoisoned hugepage is already
    3456                 :            :                  * unmapped and its refcount is dropped, so just clear pte here.
    3457                 :            :                  */
    3458         [ #  # ]:          0 :                 if (unlikely(!pte_present(pte))) {
    3459                 :          0 :                         huge_pte_clear(mm, address, ptep, sz);
    3460                 :          0 :                         spin_unlock(ptl);
    3461                 :          0 :                         continue;
    3462                 :            :                 }
    3463                 :            : 
    3464         [ #  # ]:          0 :                 page = pte_page(pte);
    3465                 :            :                 /*
    3466                 :            :                  * If a reference page is supplied, it is because a specific
    3467                 :            :                  * page is being unmapped, not a range. Ensure the page we
    3468                 :            :                  * are about to unmap is the actual page of interest.
    3469                 :            :                  */
    3470         [ #  # ]:          0 :                 if (ref_page) {
    3471         [ #  # ]:          0 :                         if (page != ref_page) {
    3472                 :          0 :                                 spin_unlock(ptl);
    3473                 :          0 :                                 continue;
    3474                 :            :                         }
    3475                 :            :                         /*
    3476                 :            :                          * Mark the VMA as having unmapped its page so that
    3477                 :            :                          * future faults in this VMA will fail rather than
    3478                 :            :                          * looking like data was lost
    3479                 :            :                          */
    3480                 :          0 :                         set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
    3481                 :            :                 }
    3482                 :            : 
    3483                 :          0 :                 pte = huge_ptep_get_and_clear(mm, address, ptep);
    3484   [ #  #  #  # ]:          0 :                 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
    3485         [ #  # ]:          0 :                 if (huge_pte_dirty(pte))
    3486                 :          0 :                         set_page_dirty(page);
    3487                 :            : 
    3488                 :          0 :                 hugetlb_count_sub(pages_per_huge_page(h), mm);
    3489                 :          0 :                 page_remove_rmap(page, true);
    3490                 :            : 
    3491                 :          0 :                 spin_unlock(ptl);
    3492                 :          0 :                 tlb_remove_page_size(tlb, page, huge_page_size(h));
    3493                 :            :                 /*
    3494                 :            :                  * Bail out after unmapping reference page if supplied
    3495                 :            :                  */
    3496         [ #  # ]:          0 :                 if (ref_page)
    3497                 :            :                         break;
    3498                 :            :         }
    3499                 :          0 :         mmu_notifier_invalidate_range_end(&range);
    3500                 :          0 :         tlb_end_vma(tlb, vma);
    3501                 :          0 : }
    3502                 :            : 
    3503                 :          0 : void __unmap_hugepage_range_final(struct mmu_gather *tlb,
    3504                 :            :                           struct vm_area_struct *vma, unsigned long start,
    3505                 :            :                           unsigned long end, struct page *ref_page)
    3506                 :            : {
    3507                 :          0 :         __unmap_hugepage_range(tlb, vma, start, end, ref_page);
    3508                 :            : 
    3509                 :            :         /*
    3510                 :            :          * Clear this flag so that x86's huge_pmd_share page_table_shareable
    3511                 :            :          * test will fail on a vma being torn down, and not grab a page table
    3512                 :            :          * on its way out.  We're lucky that the flag has such an appropriate
    3513                 :            :          * name, and can in fact be safely cleared here. We could clear it
    3514                 :            :          * before the __unmap_hugepage_range above, but all that's necessary
    3515                 :            :          * is to clear it before releasing the i_mmap_rwsem. This works
    3516                 :            :          * because in the context this is called, the VMA is about to be
    3517                 :            :          * destroyed and the i_mmap_rwsem is held.
    3518                 :            :          */
    3519                 :          0 :         vma->vm_flags &= ~VM_MAYSHARE;
    3520                 :          0 : }
    3521                 :            : 
    3522                 :          0 : void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
    3523                 :            :                           unsigned long end, struct page *ref_page)
    3524                 :            : {
    3525                 :          0 :         struct mm_struct *mm;
    3526                 :          0 :         struct mmu_gather tlb;
    3527                 :          0 :         unsigned long tlb_start = start;
    3528                 :          0 :         unsigned long tlb_end = end;
    3529                 :            : 
    3530                 :            :         /*
    3531                 :            :          * If shared PMDs were possibly used within this vma range, adjust
    3532                 :            :          * start/end for worst case tlb flushing.
    3533                 :            :          * Note that we can not be sure if PMDs are shared until we try to
    3534                 :            :          * unmap pages.  However, we want to make sure TLB flushing covers
    3535                 :            :          * the largest possible range.
    3536                 :            :          */
    3537                 :          0 :         adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
    3538                 :            : 
    3539                 :          0 :         mm = vma->vm_mm;
    3540                 :            : 
    3541                 :          0 :         tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
    3542                 :          0 :         __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
    3543                 :          0 :         tlb_finish_mmu(&tlb, tlb_start, tlb_end);
    3544                 :          0 : }
    3545                 :            : 
    3546                 :            : /*
    3547                 :            :  * This is called when the original mapper is failing to COW a MAP_PRIVATE
    3548                 :            :  * mapping it owns the reserve page for. The intention is to unmap the page
    3549                 :            :  * from other VMAs and let the children be SIGKILLed if they are faulting the
    3550                 :            :  * same region.
    3551                 :            :  */
    3552                 :            : static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
    3553                 :            :                               struct page *page, unsigned long address)
    3554                 :            : {
    3555                 :            :         struct hstate *h = hstate_vma(vma);
    3556                 :            :         struct vm_area_struct *iter_vma;
    3557                 :            :         struct address_space *mapping;
    3558                 :            :         pgoff_t pgoff;
    3559                 :            : 
    3560                 :            :         /*
    3561                 :            :          * vm_pgoff is in PAGE_SIZE units, hence the different calculation
    3562                 :            :          * from page cache lookup which is in HPAGE_SIZE units.
    3563                 :            :          */
    3564                 :            :         address = address & huge_page_mask(h);
    3565                 :            :         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
    3566                 :            :                         vma->vm_pgoff;
    3567                 :            :         mapping = vma->vm_file->f_mapping;
    3568                 :            : 
    3569                 :            :         /*
    3570                 :            :          * Take the mapping lock for the duration of the table walk. As
    3571                 :            :          * this mapping should be shared between all the VMAs,
    3572                 :            :          * __unmap_hugepage_range() is called as the lock is already held
    3573                 :            :          */
    3574                 :            :         i_mmap_lock_write(mapping);
    3575                 :            :         vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
    3576                 :            :                 /* Do not unmap the current VMA */
    3577                 :            :                 if (iter_vma == vma)
    3578                 :            :                         continue;
    3579                 :            : 
    3580                 :            :                 /*
    3581                 :            :                  * Shared VMAs have their own reserves and do not affect
    3582                 :            :                  * MAP_PRIVATE accounting but it is possible that a shared
    3583                 :            :                  * VMA is using the same page so check and skip such VMAs.
    3584                 :            :                  */
    3585                 :            :                 if (iter_vma->vm_flags & VM_MAYSHARE)
    3586                 :            :                         continue;
    3587                 :            : 
    3588                 :            :                 /*
    3589                 :            :                  * Unmap the page from other VMAs without their own reserves.
    3590                 :            :                  * They get marked to be SIGKILLed if they fault in these
    3591                 :            :                  * areas. This is because a future no-page fault on this VMA
    3592                 :            :                  * could insert a zeroed page instead of the data existing
    3593                 :            :                  * from the time of fork. This would look like data corruption
    3594                 :            :                  */
    3595                 :            :                 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
    3596                 :            :                         unmap_hugepage_range(iter_vma, address,
    3597                 :            :                                              address + huge_page_size(h), page);
    3598                 :            :         }
    3599                 :            :         i_mmap_unlock_write(mapping);
    3600                 :            : }
    3601                 :            : 
    3602                 :            : /*
    3603                 :            :  * hugetlb_cow() should be called with page lock of the original hugepage held.
    3604                 :            :  * Called with hugetlb_fault_mutex held and pte_page locked so we
    3605                 :            :  * cannot race with other handlers or page migration.
    3606                 :            :  * Keep the pte_same checks anyway to make transition from the mutex easier.
    3607                 :            :  */
    3608                 :          0 : static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
    3609                 :            :                        unsigned long address, pte_t *ptep,
    3610                 :            :                        struct page *pagecache_page, spinlock_t *ptl)
    3611                 :            : {
    3612                 :          0 :         pte_t pte;
    3613         [ #  # ]:          0 :         struct hstate *h = hstate_vma(vma);
    3614                 :          0 :         struct page *old_page, *new_page;
    3615                 :          0 :         int outside_reserve = 0;
    3616                 :          0 :         vm_fault_t ret = 0;
    3617         [ #  # ]:          0 :         unsigned long haddr = address & huge_page_mask(h);
    3618                 :          0 :         struct mmu_notifier_range range;
    3619                 :            : 
    3620         [ #  # ]:          0 :         pte = huge_ptep_get(ptep);
    3621         [ #  # ]:          0 :         old_page = pte_page(pte);
    3622                 :            : 
    3623                 :          0 : retry_avoidcopy:
    3624                 :            :         /* If no-one else is actually using this page, avoid the copy
    3625                 :            :          * and just make the page writable */
    3626   [ #  #  #  # ]:          0 :         if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
    3627                 :          0 :                 page_move_anon_rmap(old_page, vma);
    3628                 :          0 :                 set_huge_ptep_writable(vma, haddr, ptep);
    3629                 :          0 :                 return 0;
    3630                 :            :         }
    3631                 :            : 
    3632                 :            :         /*
    3633                 :            :          * If the process that created a MAP_PRIVATE mapping is about to
    3634                 :            :          * perform a COW due to a shared page count, attempt to satisfy
    3635                 :            :          * the allocation without using the existing reserves. The pagecache
    3636                 :            :          * page is used to determine if the reserve at this address was
    3637                 :            :          * consumed or not. If reserves were used, a partial faulted mapping
    3638                 :            :          * at the time of fork() could consume its reserves on COW instead
    3639                 :            :          * of the full address range.
    3640                 :            :          */
    3641   [ #  #  #  # ]:          0 :         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
    3642                 :            :                         old_page != pagecache_page)
    3643                 :          0 :                 outside_reserve = 1;
    3644                 :            : 
    3645         [ #  # ]:          0 :         get_page(old_page);
    3646                 :            : 
    3647                 :            :         /*
    3648                 :            :          * Drop page table lock as buddy allocator may be called. It will
    3649                 :            :          * be acquired again before returning to the caller, as expected.
    3650                 :            :          */
    3651                 :          0 :         spin_unlock(ptl);
    3652                 :          0 :         new_page = alloc_huge_page(vma, haddr, outside_reserve);
    3653                 :            : 
    3654         [ #  # ]:          0 :         if (IS_ERR(new_page)) {
    3655                 :            :                 /*
    3656                 :            :                  * If a process owning a MAP_PRIVATE mapping fails to COW,
    3657                 :            :                  * it is due to references held by a child and an insufficient
    3658                 :            :                  * huge page pool. To guarantee the original mapper's
    3659                 :            :                  * reliability, unmap the page from child processes. The child
    3660                 :            :                  * may get SIGKILLed if it later faults.
    3661                 :            :                  */
    3662         [ #  # ]:          0 :                 if (outside_reserve) {
    3663                 :          0 :                         put_page(old_page);
    3664         [ #  # ]:          0 :                         BUG_ON(huge_pte_none(pte));
    3665                 :          0 :                         unmap_ref_private(mm, vma, old_page, haddr);
    3666                 :          0 :                         BUG_ON(huge_pte_none(pte));
    3667                 :          0 :                         spin_lock(ptl);
    3668                 :          0 :                         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
    3669   [ #  #  #  # ]:          0 :                         if (likely(ptep &&
    3670                 :            :                                    pte_same(huge_ptep_get(ptep), pte)))
    3671                 :          0 :                                 goto retry_avoidcopy;
    3672                 :            :                         /*
    3673                 :            :                          * race occurs while re-acquiring page table
    3674                 :            :                          * lock, and our job is done.
    3675                 :            :                          */
    3676                 :            :                         return 0;
    3677                 :            :                 }
    3678                 :            : 
    3679         [ #  # ]:          0 :                 ret = vmf_error(PTR_ERR(new_page));
    3680                 :          0 :                 goto out_release_old;
    3681                 :            :         }
    3682                 :            : 
    3683                 :            :         /*
    3684                 :            :          * When the original hugepage is shared one, it does not have
    3685                 :            :          * anon_vma prepared.
    3686                 :            :          */
    3687   [ #  #  #  # ]:          0 :         if (unlikely(anon_vma_prepare(vma))) {
    3688                 :          0 :                 ret = VM_FAULT_OOM;
    3689                 :          0 :                 goto out_release_all;
    3690                 :            :         }
    3691                 :            : 
    3692                 :          0 :         copy_user_huge_page(new_page, old_page, address, vma,
    3693                 :            :                             pages_per_huge_page(h));
    3694                 :          0 :         __SetPageUptodate(new_page);
    3695                 :            : 
    3696                 :          0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
    3697                 :          0 :                                 haddr + huge_page_size(h));
    3698                 :          0 :         mmu_notifier_invalidate_range_start(&range);
    3699                 :            : 
    3700                 :            :         /*
    3701                 :            :          * Retake the page table lock to check for racing updates
    3702                 :            :          * before the page tables are altered
    3703                 :            :          */
    3704                 :          0 :         spin_lock(ptl);
    3705                 :          0 :         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
    3706   [ #  #  #  # ]:          0 :         if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
    3707                 :          0 :                 ClearPagePrivate(new_page);
    3708                 :            : 
    3709                 :            :                 /* Break COW */
    3710                 :          0 :                 huge_ptep_clear_flush(vma, haddr, ptep);
    3711         [ #  # ]:          0 :                 mmu_notifier_invalidate_range(mm, range.start, range.end);
    3712                 :          0 :                 set_huge_pte_at(mm, haddr, ptep,
    3713                 :            :                                 make_huge_pte(vma, new_page, 1));
    3714                 :          0 :                 page_remove_rmap(old_page, true);
    3715                 :          0 :                 hugepage_add_new_anon_rmap(new_page, vma, haddr);
    3716                 :          0 :                 set_page_huge_active(new_page);
    3717                 :            :                 /* Make the old page be freed below */
    3718                 :          0 :                 new_page = old_page;
    3719                 :            :         }
    3720                 :          0 :         spin_unlock(ptl);
    3721                 :          0 :         mmu_notifier_invalidate_range_end(&range);
    3722                 :          0 : out_release_all:
    3723                 :          0 :         restore_reserve_on_error(h, vma, haddr, new_page);
    3724                 :          0 :         put_page(new_page);
    3725                 :          0 : out_release_old:
    3726                 :          0 :         put_page(old_page);
    3727                 :            : 
    3728                 :          0 :         spin_lock(ptl); /* Caller expects lock to be held */
    3729                 :          0 :         return ret;
    3730                 :            : }
    3731                 :            : 
    3732                 :            : /* Return the pagecache page at a given address within a VMA */
    3733                 :          0 : static struct page *hugetlbfs_pagecache_page(struct hstate *h,
    3734                 :            :                         struct vm_area_struct *vma, unsigned long address)
    3735                 :            : {
    3736                 :          0 :         struct address_space *mapping;
    3737                 :          0 :         pgoff_t idx;
    3738                 :            : 
    3739                 :          0 :         mapping = vma->vm_file->f_mapping;
    3740                 :          0 :         idx = vma_hugecache_offset(h, vma, address);
    3741                 :            : 
    3742                 :          0 :         return find_lock_page(mapping, idx);
    3743                 :            : }
    3744                 :            : 
    3745                 :            : /*
    3746                 :            :  * Return whether there is a pagecache page to back given address within VMA.
    3747                 :            :  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
    3748                 :            :  */
    3749                 :          0 : static bool hugetlbfs_pagecache_present(struct hstate *h,
    3750                 :            :                         struct vm_area_struct *vma, unsigned long address)
    3751                 :            : {
    3752                 :          0 :         struct address_space *mapping;
    3753                 :          0 :         pgoff_t idx;
    3754                 :          0 :         struct page *page;
    3755                 :            : 
    3756                 :          0 :         mapping = vma->vm_file->f_mapping;
    3757                 :          0 :         idx = vma_hugecache_offset(h, vma, address);
    3758                 :            : 
    3759                 :          0 :         page = find_get_page(mapping, idx);
    3760         [ #  # ]:          0 :         if (page)
    3761                 :          0 :                 put_page(page);
    3762                 :          0 :         return page != NULL;
    3763                 :            : }
    3764                 :            : 
    3765                 :          0 : int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
    3766                 :            :                            pgoff_t idx)
    3767                 :            : {
    3768                 :          0 :         struct inode *inode = mapping->host;
    3769                 :          0 :         struct hstate *h = hstate_inode(inode);
    3770                 :          0 :         int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
    3771                 :            : 
    3772         [ #  # ]:          0 :         if (err)
    3773                 :            :                 return err;
    3774                 :          0 :         ClearPagePrivate(page);
    3775                 :            : 
    3776                 :            :         /*
    3777                 :            :          * set page dirty so that it will not be removed from cache/file
    3778                 :            :          * by non-hugetlbfs specific code paths.
    3779                 :            :          */
    3780                 :          0 :         set_page_dirty(page);
    3781                 :            : 
    3782                 :          0 :         spin_lock(&inode->i_lock);
    3783                 :          0 :         inode->i_blocks += blocks_per_huge_page(h);
    3784                 :          0 :         spin_unlock(&inode->i_lock);
    3785                 :          0 :         return 0;
    3786                 :            : }
    3787                 :            : 
    3788                 :          0 : static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
    3789                 :            :                         struct vm_area_struct *vma,
    3790                 :            :                         struct address_space *mapping, pgoff_t idx,
    3791                 :            :                         unsigned long address, pte_t *ptep, unsigned int flags)
    3792                 :            : {
    3793         [ #  # ]:          0 :         struct hstate *h = hstate_vma(vma);
    3794                 :          0 :         vm_fault_t ret = VM_FAULT_SIGBUS;
    3795                 :          0 :         int anon_rmap = 0;
    3796                 :          0 :         unsigned long size;
    3797                 :          0 :         struct page *page;
    3798                 :          0 :         pte_t new_pte;
    3799                 :          0 :         spinlock_t *ptl;
    3800         [ #  # ]:          0 :         unsigned long haddr = address & huge_page_mask(h);
    3801                 :          0 :         bool new_page = false;
    3802                 :            : 
    3803                 :            :         /*
    3804                 :            :          * Currently, we are forced to kill the process in the event the
    3805                 :            :          * original mapper has unmapped pages from the child due to a failed
    3806                 :            :          * COW. Warn that such a situation has occurred as it may not be obvious
    3807                 :            :          */
    3808         [ #  # ]:          0 :         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
    3809         [ #  # ]:          0 :                 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
    3810                 :            :                            current->pid);
    3811                 :          0 :                 return ret;
    3812                 :            :         }
    3813                 :            : 
    3814                 :            :         /*
    3815                 :            :          * Use page lock to guard against racing truncation
    3816                 :            :          * before we get page_table_lock.
    3817                 :            :          */
    3818                 :          0 : retry:
    3819                 :          0 :         page = find_lock_page(mapping, idx);
    3820         [ #  # ]:          0 :         if (!page) {
    3821         [ #  # ]:          0 :                 size = i_size_read(mapping->host) >> huge_page_shift(h);
    3822         [ #  # ]:          0 :                 if (idx >= size)
    3823                 :          0 :                         goto out;
    3824                 :            : 
    3825                 :            :                 /*
    3826                 :            :                  * Check for page in userfault range
    3827                 :            :                  */
    3828                 :          0 :                 if (userfaultfd_missing(vma)) {
    3829                 :            :                         u32 hash;
    3830                 :            :                         struct vm_fault vmf = {
    3831                 :            :                                 .vma = vma,
    3832                 :            :                                 .address = haddr,
    3833                 :            :                                 .flags = flags,
    3834                 :            :                                 /*
    3835                 :            :                                  * Hard to debug if it ends up being
    3836                 :            :                                  * used by a callee that assumes
    3837                 :            :                                  * something about the other
    3838                 :            :                                  * uninitialized fields... same as in
    3839                 :            :                                  * memory.c
    3840                 :            :                                  */
    3841                 :            :                         };
    3842                 :            : 
    3843                 :            :                         /*
    3844                 :            :                          * hugetlb_fault_mutex must be dropped before
    3845                 :            :                          * handling userfault.  Reacquire after handling
    3846                 :            :                          * fault to make calling code simpler.
    3847                 :            :                          */
    3848                 :            :                         hash = hugetlb_fault_mutex_hash(mapping, idx);
    3849                 :            :                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    3850                 :            :                         ret = handle_userfault(&vmf, VM_UFFD_MISSING);
    3851                 :            :                         mutex_lock(&hugetlb_fault_mutex_table[hash]);
    3852                 :            :                         goto out;
    3853                 :            :                 }
    3854                 :            : 
    3855                 :          0 :                 page = alloc_huge_page(vma, haddr, 0);
    3856         [ #  # ]:          0 :                 if (IS_ERR(page)) {
    3857                 :            :                         /*
    3858                 :            :                          * Returning error will result in faulting task being
    3859                 :            :                          * sent SIGBUS.  The hugetlb fault mutex prevents two
    3860                 :            :                          * tasks from racing to fault in the same page which
    3861                 :            :                          * could result in false unable to allocate errors.
    3862                 :            :                          * Page migration does not take the fault mutex, but
    3863                 :            :                          * does a clear then write of pte's under page table
    3864                 :            :                          * lock.  Page fault code could race with migration,
    3865                 :            :                          * notice the clear pte and try to allocate a page
    3866                 :            :                          * here.  Before returning error, get ptl and make
    3867                 :            :                          * sure there really is no pte entry.
    3868                 :            :                          */
    3869                 :          0 :                         ptl = huge_pte_lock(h, mm, ptep);
    3870         [ #  # ]:          0 :                         if (!huge_pte_none(huge_ptep_get(ptep))) {
    3871                 :          0 :                                 ret = 0;
    3872                 :          0 :                                 spin_unlock(ptl);
    3873                 :          0 :                                 goto out;
    3874                 :            :                         }
    3875                 :          0 :                         spin_unlock(ptl);
    3876         [ #  # ]:          0 :                         ret = vmf_error(PTR_ERR(page));
    3877                 :          0 :                         goto out;
    3878                 :            :                 }
    3879                 :          0 :                 clear_huge_page(page, address, pages_per_huge_page(h));
    3880                 :          0 :                 __SetPageUptodate(page);
    3881                 :          0 :                 new_page = true;
    3882                 :            : 
    3883         [ #  # ]:          0 :                 if (vma->vm_flags & VM_MAYSHARE) {
    3884                 :          0 :                         int err = huge_add_to_page_cache(page, mapping, idx);
    3885         [ #  # ]:          0 :                         if (err) {
    3886                 :          0 :                                 put_page(page);
    3887         [ #  # ]:          0 :                                 if (err == -EEXIST)
    3888                 :          0 :                                         goto retry;
    3889                 :          0 :                                 goto out;
    3890                 :            :                         }
    3891                 :            :                 } else {
    3892                 :          0 :                         lock_page(page);
    3893   [ #  #  #  # ]:          0 :                         if (unlikely(anon_vma_prepare(vma))) {
    3894                 :          0 :                                 ret = VM_FAULT_OOM;
    3895                 :          0 :                                 goto backout_unlocked;
    3896                 :            :                         }
    3897                 :            :                         anon_rmap = 1;
    3898                 :            :                 }
    3899                 :            :         } else {
    3900                 :            :                 /*
    3901                 :            :                  * If a memory error occurs between mmap() and fault, some processes
    3902                 :            :                  * don't have a hwpoisoned swap entry for the errored virtual address.
    3903                 :            :                  * So we need to block hugepage fault by PG_hwpoison bit check.
    3904                 :            :                  */
    3905                 :            :                 if (unlikely(PageHWPoison(page))) {
    3906                 :            :                         ret = VM_FAULT_HWPOISON |
    3907                 :            :                                 VM_FAULT_SET_HINDEX(hstate_index(h));
    3908                 :            :                         goto backout_unlocked;
    3909                 :            :                 }
    3910                 :            :         }
    3911                 :            : 
    3912                 :            :         /*
    3913                 :            :          * If we are going to COW a private mapping later, we examine the
    3914                 :            :          * pending reservations for this page now. This will ensure that
    3915                 :            :          * any allocations necessary to record that reservation occur outside
    3916                 :            :          * the spinlock.
    3917                 :            :          */
    3918   [ #  #  #  # ]:          0 :         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
    3919         [ #  # ]:          0 :                 if (vma_needs_reservation(h, vma, haddr) < 0) {
    3920                 :          0 :                         ret = VM_FAULT_OOM;
    3921                 :          0 :                         goto backout_unlocked;
    3922                 :            :                 }
    3923                 :            :                 /* Just decrements count, does not deallocate */
    3924                 :          0 :                 vma_end_reservation(h, vma, haddr);
    3925                 :            :         }
    3926                 :            : 
    3927                 :          0 :         ptl = huge_pte_lock(h, mm, ptep);
    3928         [ #  # ]:          0 :         size = i_size_read(mapping->host) >> huge_page_shift(h);
    3929         [ #  # ]:          0 :         if (idx >= size)
    3930                 :          0 :                 goto backout;
    3931                 :            : 
    3932                 :          0 :         ret = 0;
    3933         [ #  # ]:          0 :         if (!huge_pte_none(huge_ptep_get(ptep)))
    3934                 :          0 :                 goto backout;
    3935                 :            : 
    3936         [ #  # ]:          0 :         if (anon_rmap) {
    3937                 :          0 :                 ClearPagePrivate(page);
    3938                 :          0 :                 hugepage_add_new_anon_rmap(page, vma, haddr);
    3939                 :            :         } else
    3940                 :          0 :                 page_dup_rmap(page, true);
    3941                 :          0 :         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
    3942                 :          0 :                                 && (vma->vm_flags & VM_SHARED)));
    3943                 :          0 :         set_huge_pte_at(mm, haddr, ptep, new_pte);
    3944                 :            : 
    3945                 :          0 :         hugetlb_count_add(pages_per_huge_page(h), mm);
    3946   [ #  #  #  # ]:          0 :         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
    3947                 :            :                 /* Optimization, do the COW without a second fault */
    3948                 :          0 :                 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
    3949                 :            :         }
    3950                 :            : 
    3951                 :          0 :         spin_unlock(ptl);
    3952                 :            : 
    3953                 :            :         /*
    3954                 :            :          * Only make newly allocated pages active.  Existing pages found
    3955                 :            :          * in the pagecache could be !page_huge_active() if they have been
    3956                 :            :          * isolated for migration.
    3957                 :            :          */
    3958         [ #  # ]:          0 :         if (new_page)
    3959                 :          0 :                 set_page_huge_active(page);
    3960                 :            : 
    3961                 :          0 :         unlock_page(page);
    3962                 :            : out:
    3963                 :            :         return ret;
    3964                 :            : 
    3965                 :          0 : backout:
    3966                 :          0 :         spin_unlock(ptl);
    3967                 :          0 : backout_unlocked:
    3968                 :          0 :         unlock_page(page);
    3969                 :          0 :         restore_reserve_on_error(h, vma, haddr, page);
    3970                 :          0 :         put_page(page);
    3971                 :          0 :         goto out;
    3972                 :            : }
    3973                 :            : 
    3974                 :            : #ifdef CONFIG_SMP
    3975                 :          0 : u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
    3976                 :            : {       /* Pick the hugetlb_fault_mutex_table slot used for (mapping, idx). */
    3977                 :          0 :         unsigned long key[2];
    3978                 :          0 :         u32 hash;
    3979                 :            :         /* Key the hash on the mapping pointer and the page index. */
    3980                 :          0 :         key[0] = (unsigned long) mapping;
    3981                 :          0 :         key[1] = idx;
    3982                 :            :         /* The key length is passed to jhash2() as a count of u32 words. */
    3983                 :          0 :         hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
    3984                 :            :         /* Mask into range; presumably num_fault_mutexes is a power of two - TODO confirm. */
    3985                 :          0 :         return hash & (num_fault_mutexes - 1);
    3986                 :            : }
    3987                 :            : #else
    3988                 :            : /*
    3989                 :            :  * For uniprocessor systems we always use a single mutex, so just
    3990                 :            :  * return 0 and avoid the hashing overhead; @mapping and @idx are unused.
    3991                 :            :  */
    3992                 :            : u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
    3993                 :            : {
    3994                 :            :         return 0;
    3995                 :            : }
    3996                 :            : #endif
    3997                 :            : 
    3998                 :          0 : vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    3999                 :            :                         unsigned long address, unsigned int flags)
    4000                 :            : {
    4001                 :          0 :         pte_t *ptep, entry;
    4002                 :          0 :         spinlock_t *ptl;
    4003                 :          0 :         vm_fault_t ret;
    4004                 :          0 :         u32 hash;
    4005                 :          0 :         pgoff_t idx;
    4006                 :          0 :         struct page *page = NULL;
    4007                 :          0 :         struct page *pagecache_page = NULL;
    4008                 :          0 :         struct hstate *h = hstate_vma(vma);
    4009                 :          0 :         struct address_space *mapping;
    4010                 :          0 :         int need_wait_lock = 0;
    4011                 :          0 :         unsigned long haddr = address & huge_page_mask(h);
    4012                 :            :         /* Handle a fault at @address within @vma; returns 0 or VM_FAULT_* bits. */
    4013                 :          0 :         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
    4014         [ #  # ]:          0 :         if (ptep) {
    4015         [ #  # ]:          0 :                 entry = huge_ptep_get(ptep);
    4016         [ #  # ]:          0 :                 if (unlikely(is_hugetlb_entry_migration(entry))) {
    4017                 :          0 :                         migration_entry_wait_huge(vma, mm, ptep);
    4018                 :          0 :                         return 0;
    4019                 :            :                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
    4020                 :            :                         return VM_FAULT_HWPOISON_LARGE |
    4021                 :            :                                 VM_FAULT_SET_HINDEX(hstate_index(h));
    4022                 :            :         } else {
    4023                 :          0 :                 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
    4024         [ #  # ]:          0 :                 if (!ptep)
    4025                 :            :                         return VM_FAULT_OOM;
    4026                 :            :         }
    4027                 :            : 
    4028                 :          0 :         mapping = vma->vm_file->f_mapping;
    4029                 :          0 :         idx = vma_hugecache_offset(h, vma, haddr);
    4030                 :            : 
    4031                 :            :         /*
    4032                 :            :          * Serialize hugepage allocation and instantiation, so that we don't
    4033                 :            :          * get spurious allocation failures if two CPUs race to instantiate
    4034                 :            :          * the same page in the page cache.
    4035                 :            :          */
    4036                 :          0 :         hash = hugetlb_fault_mutex_hash(mapping, idx);
    4037                 :          0 :         mutex_lock(&hugetlb_fault_mutex_table[hash]);
    4038                 :            :         /* Re-read the pte now that the fault mutex is held. */
    4039         [ #  # ]:          0 :         entry = huge_ptep_get(ptep);
    4040         [ #  # ]:          0 :         if (huge_pte_none(entry)) {
    4041                 :          0 :                 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
    4042                 :          0 :                 goto out_mutex;
    4043                 :            :         }
    4044                 :            : 
    4045                 :          0 :         ret = 0;
    4046                 :            : 
    4047                 :            :         /*
    4048                 :            :          * entry could be a migration/hwpoison entry at this point, so this
    4049                 :            :          * check prevents the kernel from going below assuming that we have
    4050                 :            :          * an active hugepage in pagecache. This goto expects the 2nd page fault,
    4051                 :            :          * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
    4052                 :            :          * handle it.
    4053                 :            :          */
    4054         [ #  # ]:          0 :         if (!pte_present(entry))
    4055                 :          0 :                 goto out_mutex;
    4056                 :            : 
    4057                 :            :         /*
    4058                 :            :          * If we are going to COW the mapping later, we examine the pending
    4059                 :            :          * reservations for this page now. This will ensure that any
    4060                 :            :          * allocations necessary to record that reservation occur outside the
    4061                 :            :          * spinlock. For private mappings, we also look up the pagecache
    4062                 :            :          * page now as it is used to determine if a reservation has been
    4063                 :            :          * consumed.
    4064                 :            :          */
    4065   [ #  #  #  # ]:          0 :         if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
    4066         [ #  # ]:          0 :                 if (vma_needs_reservation(h, vma, haddr) < 0) {
    4067                 :          0 :                         ret = VM_FAULT_OOM;
    4068                 :          0 :                         goto out_mutex;
    4069                 :            :                 }
    4070                 :            :                 /* Just decrements count, does not deallocate */
    4071                 :          0 :                 vma_end_reservation(h, vma, haddr);
    4072                 :            : 
    4073         [ #  # ]:          0 :                 if (!(vma->vm_flags & VM_MAYSHARE))
    4074                 :          0 :                         pagecache_page = hugetlbfs_pagecache_page(h,
    4075                 :            :                                                                 vma, haddr);
    4076                 :            :         }
    4077                 :            : 
    4078                 :          0 :         ptl = huge_pte_lock(h, mm, ptep);
    4079                 :            : 
    4080                 :            :         /* Check for a racing update before calling hugetlb_cow */
    4081         [ #  # ]:          0 :         if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
    4082                 :          0 :                 goto out_ptl;
    4083                 :            : 
    4084                 :            :         /*
    4085                 :            :          * hugetlb_cow() requires page locks of pte_page(entry) and
    4086                 :            :          * pagecache_page, so here we need to take the former one
    4087                 :            :          * when page != pagecache_page or !pagecache_page.
    4088                 :            :          */
    4089         [ #  # ]:          0 :         page = pte_page(entry);
    4090         [ #  # ]:          0 :         if (page != pagecache_page)
    4091   [ #  #  #  # ]:          0 :                 if (!trylock_page(page)) {
    4092                 :          0 :                         need_wait_lock = 1;
    4093                 :          0 :                         goto out_ptl;
    4094                 :            :                 }
    4095                 :            :         /* Take a reference; it is dropped again at out_put_page. */
    4096         [ #  # ]:          0 :         get_page(page);
    4097                 :            : 
    4098         [ #  # ]:          0 :         if (flags & FAULT_FLAG_WRITE) {
    4099         [ #  # ]:          0 :                 if (!huge_pte_write(entry)) {
    4100                 :          0 :                         ret = hugetlb_cow(mm, vma, address, ptep,
    4101                 :            :                                           pagecache_page, ptl);
    4102                 :          0 :                         goto out_put_page;
    4103                 :            :                 }
    4104                 :          0 :                 entry = huge_pte_mkdirty(entry);
    4105                 :            :         }
    4106                 :          0 :         entry = pte_mkyoung(entry);
    4107                 :          0 :         if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
    4108                 :          0 :                                                 flags & FAULT_FLAG_WRITE))
    4109                 :            :                 update_mmu_cache(vma, haddr, ptep);
    4110                 :          0 : out_put_page:
    4111         [ #  # ]:          0 :         if (page != pagecache_page)
    4112                 :          0 :                 unlock_page(page);
    4113                 :          0 :         put_page(page);
    4114                 :          0 : out_ptl:
    4115                 :          0 :         spin_unlock(ptl);
    4116                 :            :         /* pagecache_page presumably came back locked and referenced from hugetlbfs_pagecache_page() - TODO confirm. */
    4117         [ #  # ]:          0 :         if (pagecache_page) {
    4118                 :          0 :                 unlock_page(pagecache_page);
    4119                 :          0 :                 put_page(pagecache_page);
    4120                 :            :         }
    4121                 :          0 : out_mutex:
    4122                 :          0 :         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    4123                 :            :         /*
    4124                 :            :          * Generally it's safe to hold a refcount while waiting for a page lock.
    4125                 :            :          * But here we only wait in order to defer the next page fault and avoid
    4126                 :            :          * a busy loop, and the page is not used after it is unlocked and before
    4127                 :            :          * returning from the current page fault. So we are safe from accessing a
    4128                 :            :          * freed page, even if we wait here without taking a refcount.
    4129                 :            :          */
    4130         [ #  # ]:          0 :         if (need_wait_lock)
    4131                 :          0 :                 wait_on_page_locked(page);
    4132                 :            :         return ret;
    4133                 :            : }
    4134                 :            : 
    4135                 :            : /*
    4136                 :            :  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
    4137                 :            :  * modifications for huge pages.
    4138                 :            :  */
    4139                 :          0 : int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
    4140                 :            :                             pte_t *dst_pte,
    4141                 :            :                             struct vm_area_struct *dst_vma,
    4142                 :            :                             unsigned long dst_addr,
    4143                 :            :                             unsigned long src_addr,
    4144                 :            :                             struct page **pagep)
    4145                 :            : {
    4146                 :          0 :         struct address_space *mapping;
    4147                 :          0 :         pgoff_t idx;
    4148                 :          0 :         unsigned long size;
    4149                 :          0 :         int vm_shared = dst_vma->vm_flags & VM_SHARED;
    4150         [ #  # ]:          0 :         struct hstate *h = hstate_vma(dst_vma);
    4151                 :          0 :         pte_t _dst_pte;
    4152                 :          0 :         spinlock_t *ptl;
    4153                 :          0 :         int ret;
    4154                 :          0 :         struct page *page;
    4155                 :            : 
    4156         [ #  # ]:          0 :         if (!*pagep) {
    4157                 :          0 :                 ret = -ENOMEM;
    4158                 :          0 :                 page = alloc_huge_page(dst_vma, dst_addr, 0);
    4159         [ #  # ]:          0 :                 if (IS_ERR(page))
    4160                 :          0 :                         goto out;
    4161                 :            : 
    4162                 :          0 :                 ret = copy_huge_page_from_user(page,
    4163                 :            :                                                 (const void __user *) src_addr,
    4164                 :            :                                                 pages_per_huge_page(h), false);
    4165                 :            : 
    4166                 :            :                 /* fallback to copy_from_user outside mmap_sem */
    4167         [ #  # ]:          0 :                 if (unlikely(ret)) {
    4168                 :          0 :                         ret = -ENOENT;
    4169                 :          0 :                         *pagep = page;
    4170                 :            :                         /* don't free the page */
    4171                 :          0 :                         goto out;
    4172                 :            :                 }
    4173                 :            :         } else {
    4174                 :          0 :                 page = *pagep;
    4175                 :          0 :                 *pagep = NULL;
    4176                 :            :         }
    4177                 :            : 
    4178                 :            :         /*
    4179                 :            :          * The memory barrier inside __SetPageUptodate makes sure that
    4180                 :            :          * preceding stores to the page contents become visible before
    4181                 :            :          * the set_pte_at() write.
    4182                 :            :          */
    4183                 :          0 :         __SetPageUptodate(page);
    4184                 :            : 
    4185                 :          0 :         mapping = dst_vma->vm_file->f_mapping;
    4186         [ #  # ]:          0 :         idx = vma_hugecache_offset(h, dst_vma, dst_addr);
    4187                 :            : 
    4188                 :            :         /*
    4189                 :            :          * If shared, add to page cache
    4190                 :            :          */
    4191         [ #  # ]:          0 :         if (vm_shared) {
    4192         [ #  # ]:          0 :                 size = i_size_read(mapping->host) >> huge_page_shift(h);
    4193                 :          0 :                 ret = -EFAULT;
    4194         [ #  # ]:          0 :                 if (idx >= size)
    4195                 :          0 :                         goto out_release_nounlock;
    4196                 :            : 
    4197                 :            :                 /*
    4198                 :            :                  * Serialization between remove_inode_hugepages() and
    4199                 :            :                  * huge_add_to_page_cache() below happens through the
    4200                 :            :                  * hugetlb_fault_mutex_table that here must be held by
    4201                 :            :                  * the caller.
    4202                 :            :                  */
    4203                 :          0 :                 ret = huge_add_to_page_cache(page, mapping, idx);
    4204         [ #  # ]:          0 :                 if (ret)
    4205                 :          0 :                         goto out_release_nounlock;
    4206                 :            :         }
    4207                 :            : 
    4208         [ #  # ]:          0 :         ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
    4209                 :          0 :         spin_lock(ptl);
    4210                 :            : 
    4211                 :            :         /*
    4212                 :            :          * Recheck the i_size after holding PT lock to make sure not
    4213                 :            :          * to leave any page mapped (as page_mapped()) beyond the end
    4214                 :            :          * of the i_size (remove_inode_hugepages() is strict about
    4215                 :            :          * enforcing that). If we bail out here, we'll also leave a
    4216                 :            :          * page in the radix tree in the vm_shared case beyond the end
    4217                 :            :          * of the i_size, but remove_inode_hugepages() will take care
    4218                 :            :          * of it as soon as we drop the hugetlb_fault_mutex_table.
    4219                 :            :          */
    4220         [ #  # ]:          0 :         size = i_size_read(mapping->host) >> huge_page_shift(h);
    4221                 :          0 :         ret = -EFAULT;
    4222         [ #  # ]:          0 :         if (idx >= size)
    4223                 :          0 :                 goto out_release_unlock;
    4224                 :            : 
    4225                 :          0 :         ret = -EEXIST;
    4226         [ #  # ]:          0 :         if (!huge_pte_none(huge_ptep_get(dst_pte)))
    4227                 :          0 :                 goto out_release_unlock;
    4228                 :            : 
    4229         [ #  # ]:          0 :         if (vm_shared) {
    4230                 :          0 :                 page_dup_rmap(page, true);
    4231                 :            :         } else {
    4232                 :          0 :                 ClearPagePrivate(page);
    4233                 :          0 :                 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
    4234                 :            :         }
    4235                 :            : 
    4236                 :          0 :         _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
    4237         [ #  # ]:          0 :         if (dst_vma->vm_flags & VM_WRITE)
    4238                 :          0 :                 _dst_pte = huge_pte_mkdirty(_dst_pte);
    4239                 :          0 :         _dst_pte = pte_mkyoung(_dst_pte);
    4240                 :            : 
    4241                 :          0 :         set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
    4242                 :            : 
    4243                 :          0 :         (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
    4244                 :          0 :                                         dst_vma->vm_flags & VM_WRITE);
    4245                 :          0 :         hugetlb_count_add(pages_per_huge_page(h), dst_mm);
    4246                 :            : 
    4247                 :            :         /* No need to invalidate - it was non-present before */
    4248                 :          0 :         update_mmu_cache(dst_vma, dst_addr, dst_pte);
    4249                 :            : 
    4250                 :          0 :         spin_unlock(ptl);
    4251                 :          0 :         set_page_huge_active(page);
    4252         [ #  # ]:          0 :         if (vm_shared)
    4253                 :          0 :                 unlock_page(page);
    4254                 :            :         ret = 0;
    4255                 :          0 : out:
    4256                 :          0 :         return ret;
    4257                 :          0 : out_release_unlock:
    4258                 :          0 :         spin_unlock(ptl);
    4259         [ #  # ]:          0 :         if (vm_shared)
    4260                 :          0 :                 unlock_page(page);
    4261                 :          0 : out_release_nounlock:
    4262                 :          0 :         put_page(page);
    4263                 :          0 :         goto out;
    4264                 :            : }
    4265                 :            : 
    4266                 :          0 : long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
    4267                 :            :                          struct page **pages, struct vm_area_struct **vmas,
    4268                 :            :                          unsigned long *position, unsigned long *nr_pages,
    4269                 :            :                          long i, unsigned int flags, int *nonblocking)
    4270                 :            : {
    4271                 :          0 :         unsigned long pfn_offset;
    4272                 :          0 :         unsigned long vaddr = *position;
    4273                 :          0 :         unsigned long remainder = *nr_pages;
    4274                 :          0 :         struct hstate *h = hstate_vma(vma);
    4275                 :          0 :         int err = -EFAULT;
    4276                 :            : 
    4277   [ #  #  #  # ]:          0 :         while (vaddr < vma->vm_end && remainder) {
    4278                 :          0 :                 pte_t *pte;
    4279                 :          0 :                 spinlock_t *ptl = NULL;
    4280                 :          0 :                 int absent;
    4281                 :          0 :                 struct page *page;
    4282                 :            : 
    4283                 :            :                 /*
    4284                 :            :                  * If we have a pending SIGKILL, don't keep faulting pages and
    4285                 :            :                  * potentially allocating memory.
    4286                 :            :                  */
    4287         [ #  # ]:          0 :                 if (fatal_signal_pending(current)) {
    4288                 :            :                         remainder = 0;
    4289                 :            :                         break;
    4290                 :            :                 }
    4291                 :            : 
    4292                 :            :                 /*
    4293                 :            :                  * Some archs (sparc64, sh*) have multiple pte_ts to
    4294                 :            :                  * each hugepage.  We have to make sure we get the
    4295                 :            :                  * first, for the page indexing below to work.
    4296                 :            :                  *
    4297                 :            :                  * Note that page table lock is not held when pte is null.
    4298                 :            :                  */
    4299                 :          0 :                 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
    4300                 :            :                                       huge_page_size(h));
    4301         [ #  # ]:          0 :                 if (pte)
    4302                 :          0 :                         ptl = huge_pte_lock(h, mm, pte);
    4303   [ #  #  #  # ]:          0 :                 absent = !pte || huge_pte_none(huge_ptep_get(pte));
    4304                 :            : 
    4305                 :            :                 /*
    4306                 :            :                  * When coredumping, it suits get_dump_page if we just return
    4307                 :            :                  * an error where there's an empty slot with no huge pagecache
    4308                 :            :                  * to back it.  This way, we avoid allocating a hugepage, and
    4309                 :            :                  * the sparse dumpfile avoids allocating disk blocks, but its
    4310                 :            :                  * huge holes still show up with zeroes where they need to be.
    4311                 :            :                  */
    4312   [ #  #  #  # ]:          0 :                 if (absent && (flags & FOLL_DUMP) &&
    4313                 :          0 :                     !hugetlbfs_pagecache_present(h, vma, vaddr)) {
    4314         [ #  # ]:          0 :                         if (pte)
    4315                 :          0 :                                 spin_unlock(ptl);
    4316                 :            :                         remainder = 0;
    4317                 :            :                         break;
    4318                 :            :                 }
    4319                 :            : 
    4320                 :            :                 /*
    4321                 :            :                  * We need to call hugetlb_fault for both hugepages under migration
    4322                 :            :                  * (in which case hugetlb_fault waits for the migration) and
    4323                 :            :                  * hwpoisoned hugepages (in which case we need to prevent the
    4324                 :            :                  * caller from accessing them). In order to do this, we use
    4325                 :            :                  * is_swap_pte here instead of is_hugetlb_entry_migration and
    4326                 :            :                  * is_hugetlb_entry_hwpoisoned. This is because it simply covers
    4327                 :            :                  * both cases, and because we can't follow correct pages
    4328                 :            :                  * directly from any kind of swap entries.
    4329                 :            :                  */
    4330         [ #  # ]:          0 :                 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
    4331   [ #  #  #  # ]:          0 :                     ((flags & FOLL_WRITE) &&
    4332                 :            :                       !huge_pte_write(huge_ptep_get(pte)))) {
    4333                 :          0 :                         vm_fault_t ret;
    4334                 :          0 :                         unsigned int fault_flags = 0;
    4335                 :            : 
    4336         [ #  # ]:          0 :                         if (pte)
    4337                 :          0 :                                 spin_unlock(ptl);
    4338                 :          0 :                         if (flags & FOLL_WRITE)
    4339                 :            :                                 fault_flags |= FAULT_FLAG_WRITE;
    4340         [ #  # ]:          0 :                         if (nonblocking)
    4341                 :          0 :                                 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
    4342         [ #  # ]:          0 :                         if (flags & FOLL_NOWAIT)
    4343                 :          0 :                                 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
    4344                 :            :                                         FAULT_FLAG_RETRY_NOWAIT;
    4345         [ #  # ]:          0 :                         if (flags & FOLL_TRIED) {
    4346                 :          0 :                                 VM_WARN_ON_ONCE(fault_flags &
    4347                 :            :                                                 FAULT_FLAG_ALLOW_RETRY);
    4348                 :          0 :                                 fault_flags |= FAULT_FLAG_TRIED;
    4349                 :            :                         }
    4350                 :          0 :                         ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
    4351         [ #  # ]:          0 :                         if (ret & VM_FAULT_ERROR) {
    4352         [ #  # ]:          0 :                                 err = vm_fault_to_errno(ret, flags);
    4353                 :            :                                 remainder = 0;
    4354                 :            :                                 break;
    4355                 :            :                         }
    4356         [ #  # ]:          0 :                         if (ret & VM_FAULT_RETRY) {
    4357         [ #  # ]:          0 :                                 if (nonblocking &&
    4358         [ #  # ]:          0 :                                     !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
    4359                 :          0 :                                         *nonblocking = 0;
    4360                 :          0 :                                 *nr_pages = 0;
    4361                 :            :                                 /*
    4362                 :            :                                  * VM_FAULT_RETRY must not return an
    4363                 :            :                                  * error, it will return zero
    4364                 :            :                                  * instead.
    4365                 :            :                                  *
    4366                 :            :                                  * No need to update "position" as the
    4367                 :            :                                  * caller will not check it after
    4368                 :            :                                  * *nr_pages is set to 0.
    4369                 :            :                                  */
    4370                 :          0 :                                 return i;
    4371                 :            :                         }
    4372                 :          0 :                         continue;
    4373                 :            :                 }
    4374                 :            : 
    4375         [ #  # ]:          0 :                 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
    4376         [ #  # ]:          0 :                 page = pte_page(huge_ptep_get(pte));
    4377                 :            : 
    4378                 :            :                 /*
    4379                 :            :                  * Instead of doing 'try_get_page()' below in the same_page
    4380                 :            :                  * loop, just check the count once here.
    4381                 :            :                  */
    4382   [ #  #  #  # ]:          0 :                 if (unlikely(page_count(page) <= 0)) {
    4383         [ #  # ]:          0 :                         if (pages) {
    4384                 :          0 :                                 spin_unlock(ptl);
    4385                 :          0 :                                 remainder = 0;
    4386                 :          0 :                                 err = -ENOMEM;
    4387                 :          0 :                                 break;
    4388                 :            :                         }
    4389                 :            :                 }
    4390                 :            : 
    4391                 :            :                 /*
    4392                 :            :                  * If subpage information is not requested, update counters
    4393                 :            :                  * and skip the same_page loop below.
    4394                 :            :                  */
    4395   [ #  #  #  #  :          0 :                 if (!pages && !vmas && !pfn_offset &&
                   #  # ]
    4396         [ #  # ]:          0 :                     (vaddr + huge_page_size(h) < vma->vm_end) &&
    4397         [ #  # ]:          0 :                     (remainder >= pages_per_huge_page(h))) {
    4398                 :          0 :                         vaddr += huge_page_size(h);
    4399                 :          0 :                         remainder -= pages_per_huge_page(h);
    4400                 :          0 :                         i += pages_per_huge_page(h);
    4401                 :          0 :                         spin_unlock(ptl);
    4402                 :          0 :                         continue;
    4403                 :            :                 }
    4404                 :            : 
    4405                 :          0 : same_page:
    4406         [ #  # ]:          0 :                 if (pages) {
    4407         [ #  # ]:          0 :                         pages[i] = mem_map_offset(page, pfn_offset);
    4408         [ #  # ]:          0 :                         get_page(pages[i]);
    4409                 :            :                 }
    4410                 :            : 
    4411         [ #  # ]:          0 :                 if (vmas)
    4412                 :          0 :                         vmas[i] = vma;
    4413                 :            : 
    4414                 :          0 :                 vaddr += PAGE_SIZE;
    4415                 :          0 :                 ++pfn_offset;
    4416                 :          0 :                 --remainder;
    4417                 :          0 :                 ++i;
    4418   [ #  #  #  #  :          0 :                 if (vaddr < vma->vm_end && remainder &&
                   #  # ]
    4419         [ #  # ]:          0 :                                 pfn_offset < pages_per_huge_page(h)) {
    4420                 :            :                         /*
    4421                 :            :                          * We use pfn_offset to avoid touching the pageframes
    4422                 :            :                          * of this compound page.
    4423                 :            :                          */
    4424                 :          0 :                         goto same_page;
    4425                 :            :                 }
    4426                 :          0 :                 spin_unlock(ptl);
    4427                 :            :         }
    4428                 :          0 :         *nr_pages = remainder;
    4429                 :            :         /*
    4430                 :            :          * Setting position is actually required only if remainder is
    4431                 :            :          * not zero, but it's faster not to add an "if (remainder)"
    4432                 :            :          * branch.
    4433                 :            :          */
    4434                 :          0 :         *position = vaddr;
    4435                 :            : 
    4436         [ #  # ]:          0 :         return i ? i : err;
    4437                 :            : }
    4438                 :            : 
    4439                 :            : #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
    4440                 :            : /*
    4441                 :            :  * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
    4442                 :            :  * implement this.
    4443                 :            :  */
    4444                 :            : #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
    4445                 :            : #endif
    4446                 :            : 
    4447                 :          0 : unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
    4448                 :            :                 unsigned long address, unsigned long end, pgprot_t newprot)
    4449                 :            : {
    4450                 :          0 :         struct mm_struct *mm = vma->vm_mm;
    4451                 :          0 :         unsigned long start = address;
    4452                 :          0 :         pte_t *ptep;
    4453                 :          0 :         pte_t pte;
    4454                 :          0 :         struct hstate *h = hstate_vma(vma);
    4455                 :          0 :         unsigned long pages = 0;
    4456                 :          0 :         bool shared_pmd = false;
    4457                 :          0 :         struct mmu_notifier_range range;
    4458                 :            : 
    4459                 :            :         /*
    4460                 :            :          * In the case of shared PMDs, the area to flush could be beyond
    4461                 :            :          * start/end.  Set range.start/range.end to cover the maximum possible
    4462                 :            :          * range if PMD sharing is possible.
    4463                 :            :          */
    4464                 :          0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
    4465                 :            :                                 0, vma, mm, start, end);
    4466                 :          0 :         adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
    4467                 :            : 
    4468         [ #  # ]:          0 :         BUG_ON(address >= end);
    4469                 :          0 :         flush_cache_range(vma, range.start, range.end);
    4470                 :            : 
    4471                 :          0 :         mmu_notifier_invalidate_range_start(&range);
    4472                 :          0 :         i_mmap_lock_write(vma->vm_file->f_mapping);
    4473         [ #  # ]:          0 :         for (; address < end; address += huge_page_size(h)) {
    4474                 :          0 :                 spinlock_t *ptl;
    4475                 :          0 :                 ptep = huge_pte_offset(mm, address, huge_page_size(h));
    4476         [ #  # ]:          0 :                 if (!ptep)
    4477                 :          0 :                         continue;
    4478                 :          0 :                 ptl = huge_pte_lock(h, mm, ptep);
    4479         [ #  # ]:          0 :                 if (huge_pmd_unshare(mm, &address, ptep)) {
    4480                 :          0 :                         pages++;
    4481                 :          0 :                         spin_unlock(ptl);
    4482                 :          0 :                         shared_pmd = true;
    4483                 :          0 :                         continue;
    4484                 :            :                 }
    4485         [ #  # ]:          0 :                 pte = huge_ptep_get(ptep);
    4486         [ #  # ]:          0 :                 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
    4487                 :            :                         spin_unlock(ptl);
    4488                 :            :                         continue;
    4489                 :            :                 }
    4490         [ #  # ]:          0 :                 if (unlikely(is_hugetlb_entry_migration(pte))) {
    4491         [ #  # ]:          0 :                         swp_entry_t entry = pte_to_swp_entry(pte);
    4492                 :            : 
    4493         [ #  # ]:          0 :                         if (is_write_migration_entry(entry)) {
    4494                 :          0 :                                 pte_t newpte;
    4495                 :            : 
    4496                 :          0 :                                 make_migration_entry_read(&entry);
    4497                 :          0 :                                 newpte = swp_entry_to_pte(entry);
    4498                 :          0 :                                 set_huge_swap_pte_at(mm, address, ptep,
    4499                 :            :                                                      newpte, huge_page_size(h));
    4500                 :          0 :                                 pages++;
    4501                 :            :                         }
    4502                 :          0 :                         spin_unlock(ptl);
    4503                 :          0 :                         continue;
    4504                 :            :                 }
    4505         [ #  # ]:          0 :                 if (!huge_pte_none(pte)) {
    4506                 :          0 :                         pte_t old_pte;
    4507                 :            : 
    4508                 :          0 :                         old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
    4509                 :          0 :                         pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
    4510                 :          0 :                         pte = arch_make_huge_pte(pte, vma, NULL, 0);
    4511                 :          0 :                         huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
    4512                 :          0 :                         pages++;
    4513                 :            :                 }
    4514                 :          0 :                 spin_unlock(ptl);
    4515                 :            :         }
    4516                 :            :         /*
    4517                 :            :          * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
    4518                 :            :          * may have cleared our pud entry and done put_page on the page table:
    4519                 :            :          * once we release i_mmap_rwsem, another task can do the final put_page
    4520                 :            :          * and that page table can be reused and filled with junk.  If we actually
    4521                 :            :          * did unshare a page of pmds, flush the range corresponding to the pud.
    4522                 :            :          */
    4523         [ #  # ]:          0 :         if (shared_pmd)
    4524         [ #  # ]:          0 :                 flush_hugetlb_tlb_range(vma, range.start, range.end);
    4525                 :            :         else
    4526         [ #  # ]:          0 :                 flush_hugetlb_tlb_range(vma, start, end);
    4527                 :            :         /*
    4528                 :            :          * No need to call mmu_notifier_invalidate_range(): we are downgrading
    4529                 :            :          * page table protection, not changing it to point to a new page.
    4530                 :            :          *
    4531                 :            :          * See Documentation/vm/mmu_notifier.rst
    4532                 :            :          */
    4533                 :          0 :         i_mmap_unlock_write(vma->vm_file->f_mapping);
    4534                 :          0 :         mmu_notifier_invalidate_range_end(&range);
    4535                 :            : 
    4536                 :          0 :         return pages << h->order;
    4537                 :            : }
    4538                 :            : 
    4539                 :          0 : int hugetlb_reserve_pages(struct inode *inode,
    4540                 :            :                                         long from, long to,
    4541                 :            :                                         struct vm_area_struct *vma,
    4542                 :            :                                         vm_flags_t vm_flags)
    4543                 :            : {
    4544                 :          0 :         long ret, chg;
    4545         [ #  # ]:          0 :         struct hstate *h = hstate_inode(inode);
    4546                 :          0 :         struct hugepage_subpool *spool = subpool_inode(inode);
    4547                 :          0 :         struct resv_map *resv_map;
    4548                 :          0 :         long gbl_reserve;
    4549                 :            : 
    4550                 :            :         /* This should never happen */
    4551         [ #  # ]:          0 :         if (from > to) {
    4552                 :            :                 VM_WARN(1, "%s called with a negative range\n", __func__);
    4553                 :            :                 return -EINVAL;
    4554                 :            :         }
    4555                 :            : 
    4556                 :            :         /*
    4557                 :            :          * Only apply hugepage reservation if asked. At fault time, an
    4558                 :            :          * attempt will be made for VM_NORESERVE to allocate a page
    4559                 :            :          * without using reserves.
    4560                 :            :          */
    4561         [ #  # ]:          0 :         if (vm_flags & VM_NORESERVE)
    4562                 :            :                 return 0;
    4563                 :            : 
    4564                 :            :         /*
    4565                 :            :          * Shared mappings base their reservation on the number of pages that
    4566                 :            :          * are already allocated on behalf of the file. Private mappings need
    4567                 :            :          * to reserve the full area even if read-only as mprotect() may be
    4568                 :            :          * called to make the mapping read-write. Assume !vma is a shm mapping
    4569                 :            :          */
    4570   [ #  #  #  # ]:          0 :         if (!vma || vma->vm_flags & VM_MAYSHARE) {
    4571                 :            :                 /*
    4572                 :            :                  * resv_map can not be NULL as hugetlb_reserve_pages is only
    4573                 :            :                  * called for inodes for which resv_maps were created (see
    4574                 :            :                  * hugetlbfs_get_inode).
    4575                 :            :                  */
    4576                 :          0 :                 resv_map = inode_resv_map(inode);
    4577                 :            : 
    4578                 :          0 :                 chg = region_chg(resv_map, from, to);
    4579                 :            : 
    4580                 :            :         } else {
    4581                 :          0 :                 resv_map = resv_map_alloc();
    4582         [ #  # ]:          0 :                 if (!resv_map)
    4583                 :            :                         return -ENOMEM;
    4584                 :            : 
    4585                 :          0 :                 chg = to - from;
    4586                 :            : 
    4587                 :          0 :                 set_vma_resv_map(vma, resv_map);
    4588                 :          0 :                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
    4589                 :            :         }
    4590                 :            : 
    4591         [ #  # ]:          0 :         if (chg < 0) {
    4592                 :          0 :                 ret = chg;
    4593                 :          0 :                 goto out_err;
    4594                 :            :         }
    4595                 :            : 
    4596                 :            :         /*
    4597                 :            :          * There must be enough pages in the subpool for the mapping. If
    4598                 :            :          * the subpool has a minimum size, there may be some global
    4599                 :            :          * reservations already in place (gbl_reserve).
    4600                 :            :          */
    4601                 :          0 :         gbl_reserve = hugepage_subpool_get_pages(spool, chg);
    4602         [ #  # ]:          0 :         if (gbl_reserve < 0) {
    4603                 :          0 :                 ret = -ENOSPC;
    4604                 :          0 :                 goto out_err;
    4605                 :            :         }
    4606                 :            : 
    4607                 :            :         /*
    4608                 :            :          * Check that enough hugepages are available for the reservation.
    4609                 :            :          * Hand the pages back to the subpool if there are not.
    4610                 :            :          */
    4611                 :          0 :         ret = hugetlb_acct_memory(h, gbl_reserve);
    4612         [ #  # ]:          0 :         if (ret < 0) {
    4613                 :            :                 /* put back original number of pages, chg */
    4614                 :          0 :                 (void)hugepage_subpool_put_pages(spool, chg);
    4615                 :          0 :                 goto out_err;
    4616                 :            :         }
    4617                 :            : 
    4618                 :            :         /*
    4619                 :            :          * Account for the reservations made. Shared mappings record regions
    4620                 :            :          * that have reservations as they are shared by multiple VMAs.
    4621                 :            :          * When the last VMA disappears, the region map says how much
    4622                 :            :          * the reservation was and the page cache tells how much of
    4623                 :            :          * the reservation was consumed. Private mappings are per-VMA and
    4624                 :            :          * only the consumed reservations are tracked. When the VMA
    4625                 :            :          * disappears, the original reservation is the VMA size and the
    4626                 :            :          * consumed reservations are stored in the map. Hence, nothing
    4627                 :            :          * else has to be done for private mappings here
    4628                 :            :          */
    4629   [ #  #  #  # ]:          0 :         if (!vma || vma->vm_flags & VM_MAYSHARE) {
    4630                 :          0 :                 long add = region_add(resv_map, from, to);
    4631                 :            : 
    4632         [ #  # ]:          0 :                 if (unlikely(chg > add)) {
    4633                 :            :                         /*
    4634                 :            :                          * pages in this range were added to the reserve
    4635                 :            :                          * map between region_chg and region_add.  This
    4636                 :            :                          * indicates a race with alloc_huge_page.  Adjust
    4637                 :            :                          * the subpool and reserve counts modified above
    4638                 :            :                          * based on the difference.
    4639                 :            :                          */
    4640                 :          0 :                         long rsv_adjust;
    4641                 :            : 
    4642                 :          0 :                         rsv_adjust = hugepage_subpool_put_pages(spool,
    4643                 :            :                                                                 chg - add);
    4644                 :          0 :                         hugetlb_acct_memory(h, -rsv_adjust);
    4645                 :            :                 }
    4646                 :            :         }
    4647                 :            :         return 0;
    4648                 :          0 : out_err:
    4649   [ #  #  #  # ]:          0 :         if (!vma || vma->vm_flags & VM_MAYSHARE)
    4650                 :            :                 /* Don't call region_abort if region_chg failed */
    4651         [ #  # ]:          0 :                 if (chg >= 0)
    4652                 :          0 :                         region_abort(resv_map, from, to);
    4653   [ #  #  #  # ]:          0 :         if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
    4654                 :          0 :                 kref_put(&resv_map->refs, resv_map_release);
    4655                 :          0 :         return ret;
    4656                 :            : }
    4657                 :            : 
    4658                 :          0 : long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
    4659                 :            :                                                                 long freed)
    4660                 :            : {
    4661         [ #  # ]:          0 :         struct hstate *h = hstate_inode(inode);
    4662                 :          0 :         struct resv_map *resv_map = inode_resv_map(inode);
    4663                 :          0 :         long chg = 0;
    4664                 :          0 :         struct hugepage_subpool *spool = subpool_inode(inode);
    4665                 :          0 :         long gbl_reserve;
    4666                 :            : 
    4667                 :            :         /*
    4668                 :            :          * Since this routine can be called in the evict inode path for all
    4669                 :            :          * hugetlbfs inodes, resv_map could be NULL.
    4670                 :            :          */
    4671         [ #  # ]:          0 :         if (resv_map) {
    4672                 :          0 :                 chg = region_del(resv_map, start, end);
    4673                 :            :                 /*
    4674                 :            :                  * region_del() can fail in the rare case where a region
    4675                 :            :                  * must be split and another region descriptor can not be
    4676                 :            :                  * allocated.  If end == LONG_MAX, it will not fail.
    4677                 :            :                  */
    4678         [ #  # ]:          0 :                 if (chg < 0)
    4679                 :            :                         return chg;
    4680                 :            :         }
    4681                 :            : 
    4682                 :          0 :         spin_lock(&inode->i_lock);
    4683                 :          0 :         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
    4684                 :          0 :         spin_unlock(&inode->i_lock);
    4685                 :            : 
    4686                 :            :         /*
    4687                 :            :          * If the subpool has a minimum size, the number of global
    4688                 :            :          * reservations to be released may be adjusted.
    4689                 :            :          */
    4690                 :          0 :         gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
    4691                 :          0 :         hugetlb_acct_memory(h, -gbl_reserve);
    4692                 :            : 
    4693                 :          0 :         return 0;
    4694                 :            : }
    4695                 :            : 
    4696                 :            : #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
    4697                 :            : static unsigned long page_table_shareable(struct vm_area_struct *svma,
    4698                 :            :                                 struct vm_area_struct *vma,
    4699                 :            :                                 unsigned long addr, pgoff_t idx)
    4700                 :            : {
    4701                 :            :         unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
    4702                 :            :                                 svma->vm_start;
    4703                 :            :         unsigned long sbase = saddr & PUD_MASK;
    4704                 :            :         unsigned long s_end = sbase + PUD_SIZE;
    4705                 :            : 
    4706                 :            :         /* Allow segments to share if only one is marked locked */
    4707                 :            :         unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
    4708                 :            :         unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
    4709                 :            : 
    4710                 :            :         /*
    4711                 :            :          * match the virtual addresses, permission and the alignment of the
    4712                 :            :          * page table page.
    4713                 :            :          */
    4714                 :            :         if (pmd_index(addr) != pmd_index(saddr) ||
    4715                 :            :             vm_flags != svm_flags ||
    4716                 :            :             sbase < svma->vm_start || svma->vm_end < s_end)
    4717                 :            :                 return 0;
    4718                 :            : 
    4719                 :            :         return saddr;
    4720                 :            : }
    4721                 :            : 
    4722                 :          0 : static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
    4723                 :            : {
    4724                 :          0 :         unsigned long base = addr & PUD_MASK;
    4725                 :          0 :         unsigned long end = base + PUD_SIZE;
    4726                 :            : 
    4727                 :            :         /*
    4728                 :            :          * check on proper vm_flags and page table alignment
    4729                 :            :          */
    4730         [ #  # ]:          0 :         if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
    4731                 :          0 :                 return true;
    4732                 :            :         return false;
    4733                 :            : }
    4734                 :            : 
    4735                 :            : /*
    4736                 :            :  * Determine if start,end range within vma could be mapped by shared pmd.
    4737                 :            :  * If yes, adjust start and end to cover range associated with possible
    4738                 :            :  * shared pmd mappings.
    4739                 :            :  */
    4740                 :          0 : void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
    4741                 :            :                                 unsigned long *start, unsigned long *end)
    4742                 :            : {
    4743                 :          0 :         unsigned long check_addr = *start;
    4744                 :            : 
    4745         [ #  # ]:          0 :         if (!(vma->vm_flags & VM_MAYSHARE))
    4746                 :            :                 return;
    4747                 :            : 
    4748         [ #  # ]:          0 :         for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
    4749                 :          0 :                 unsigned long a_start = check_addr & PUD_MASK;
    4750                 :          0 :                 unsigned long a_end = a_start + PUD_SIZE;
    4751                 :            : 
    4752                 :            :                 /*
    4753                 :            :                  * If sharing is possible, adjust start/end if necessary.
    4754                 :            :                  */
    4755   [ #  #  #  # ]:          0 :                 if (range_in_vma(vma, a_start, a_end)) {
    4756         [ #  # ]:          0 :                         if (a_start < *start)
    4757                 :          0 :                                 *start = a_start;
    4758         [ #  # ]:          0 :                         if (a_end > *end)
    4759                 :          0 :                                 *end = a_end;
    4760                 :            :                 }
    4761                 :            :         }
    4762                 :            : }
    4763                 :            : 
    4764                 :            : /*
    4765                 :            :  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
    4766                 :            :  * and returns the corresponding pte. While this is not necessary for the
    4767                 :            :  * !shared pmd case because we can allocate the pmd later as well, it makes the
    4768                 :            :  * code much cleaner. pmd allocation is essential for the shared case because
    4769                 :            :  * pud has to be populated inside the same i_mmap_rwsem section - otherwise
    4770                 :            :  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
    4771                 :            :  * bad pmd for sharing.
    4772                 :            :  */
    4773                 :          0 : pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
    4774                 :            : {
    4775                 :          0 :         struct vm_area_struct *vma = find_vma(mm, addr);
    4776                 :          0 :         struct address_space *mapping = vma->vm_file->f_mapping;
    4777                 :          0 :         pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
    4778                 :          0 :                         vma->vm_pgoff;
    4779                 :          0 :         struct vm_area_struct *svma;
    4780                 :          0 :         unsigned long saddr;
    4781                 :          0 :         pte_t *spte = NULL;
    4782                 :          0 :         pte_t *pte;
    4783                 :          0 :         spinlock_t *ptl;
    4784                 :            : 
    4785         [ #  # ]:          0 :         if (!vma_shareable(vma, addr))
    4786                 :          0 :                 return (pte_t *)pmd_alloc(mm, pud, addr);
    4787                 :            : 
    4788                 :          0 :         i_mmap_lock_read(mapping);
    4789         [ #  # ]:          0 :         vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
    4790         [ #  # ]:          0 :                 if (svma == vma)
    4791                 :          0 :                         continue;
    4792                 :            : 
    4793                 :          0 :                 saddr = page_table_shareable(svma, vma, addr, idx);
    4794         [ #  # ]:          0 :                 if (saddr) {
    4795                 :          0 :                         spte = huge_pte_offset(svma->vm_mm, saddr,
    4796                 :            :                                                vma_mmu_pagesize(svma));
    4797         [ #  # ]:          0 :                         if (spte) {
    4798   [ #  #  #  # ]:          0 :                                 get_page(virt_to_page(spte));
    4799                 :            :                                 break;
    4800                 :            :                         }
    4801                 :            :                 }
    4802                 :            :         }
    4803                 :            : 
    4804         [ #  # ]:          0 :         if (!spte)
    4805                 :          0 :                 goto out;
    4806                 :            : 
    4807                 :          0 :         ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
    4808         [ #  # ]:          0 :         if (pud_none(*pud)) {
    4809                 :          0 :                 pud_populate(mm, pud,
    4810         [ #  # ]:          0 :                                 (pmd_t *)((unsigned long)spte & PAGE_MASK));
    4811                 :          0 :                 mm_inc_nr_pmds(mm);
    4812                 :            :         } else {
    4813         [ #  # ]:          0 :                 put_page(virt_to_page(spte));
    4814                 :            :         }
    4815                 :          0 :         spin_unlock(ptl);
    4816                 :          0 : out:
    4817                 :          0 :         pte = (pte_t *)pmd_alloc(mm, pud, addr);
    4818                 :          0 :         i_mmap_unlock_read(mapping);
    4819                 :          0 :         return pte;
    4820                 :            : }
    4821                 :            : 
    4822                 :            : /*
    4823                 :            :  * unmap huge page backed by shared pte.
    4824                 :            :  *
    4825                 :            :  * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
    4826                 :            :  * indicated by page_count > 1, unmap is achieved by clearing pud and
    4827                 :            :  * decrementing the ref count. If count == 1, the pte page is not shared.
    4828                 :            :  *
    4829                 :            :  * called with page table lock held.
    4830                 :            :  *
    4831                 :            :  * returns: 1 successfully unmapped a shared pte page
    4832                 :            :  *          0 the underlying pte page is not shared, or it is the last user
    4833                 :            :  */
    4834                 :          0 : int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
    4835                 :            : {
    4836                 :          0 :         pgd_t *pgd = pgd_offset(mm, *addr);
    4837                 :          0 :         p4d_t *p4d = p4d_offset(pgd, *addr);
    4838         [ #  # ]:          0 :         pud_t *pud = pud_offset(p4d, *addr);
    4839                 :            : 
    4840   [ #  #  #  #  :          0 :         BUG_ON(page_count(virt_to_page(ptep)) == 0);
                   #  # ]
    4841   [ #  #  #  #  :          0 :         if (page_count(virt_to_page(ptep)) == 1)
                   #  # ]
    4842                 :            :                 return 0;
    4843                 :            : 
    4844         [ #  # ]:          0 :         pud_clear(pud);
    4845         [ #  # ]:          0 :         put_page(virt_to_page(ptep));
    4846                 :          0 :         mm_dec_nr_pmds(mm);
    4847                 :          0 :         *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
    4848                 :          0 :         return 1;
    4849                 :            : }
    4850                 :            : #define want_pmd_share()        (1)
    4851                 :            : #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
    4852                 :            : pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
    4853                 :            : {
    4854                 :            :         return NULL;
    4855                 :            : }
    4856                 :            : 
    4857                 :            : int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
    4858                 :            : {
    4859                 :            :         return 0;
    4860                 :            : }
    4861                 :            : 
    4862                 :            : void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
    4863                 :            :                                 unsigned long *start, unsigned long *end)
    4864                 :            : {
    4865                 :            : }
    4866                 :            : #define want_pmd_share()        (0)
    4867                 :            : #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
    4868                 :            : 
    4869                 :            : #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
    4870                 :          0 : pte_t *huge_pte_alloc(struct mm_struct *mm,
    4871                 :            :                         unsigned long addr, unsigned long sz)
    4872                 :            : {
    4873                 :          0 :         pgd_t *pgd;
    4874                 :          0 :         p4d_t *p4d;
    4875                 :          0 :         pud_t *pud;
    4876                 :          0 :         pte_t *pte = NULL;
    4877                 :            : 
    4878                 :          0 :         pgd = pgd_offset(mm, addr);
    4879                 :          0 :         p4d = p4d_alloc(mm, pgd, addr);
    4880         [ #  # ]:          0 :         if (!p4d)
    4881                 :            :                 return NULL;
    4882                 :          0 :         pud = pud_alloc(mm, p4d, addr);
    4883         [ #  # ]:          0 :         if (pud) {
    4884         [ #  # ]:          0 :                 if (sz == PUD_SIZE) {
    4885                 :            :                         pte = (pte_t *)pud;
    4886                 :            :                 } else {
    4887         [ #  # ]:          0 :                         BUG_ON(sz != PMD_SIZE);
    4888         [ #  # ]:          0 :                         if (want_pmd_share() && pud_none(*pud))
    4889                 :          0 :                                 pte = huge_pmd_share(mm, addr, pud);
    4890                 :            :                         else
    4891                 :          0 :                                 pte = (pte_t *)pmd_alloc(mm, pud, addr);
    4892                 :            :                 }
    4893                 :            :         }
    4894   [ #  #  #  #  :          0 :         BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
                   #  # ]
    4895                 :            : 
    4896                 :            :         return pte;
    4897                 :            : }
    4898                 :            : 
    4899                 :            : /*
    4900                 :            :  * huge_pte_offset() - Walk the page table to resolve the hugepage
    4901                 :            :  * entry at address @addr
    4902                 :            :  *
    4903                 :            :  * Return: Pointer to page table or swap entry (PUD or PMD) for
    4904                 :            :  * address @addr, or NULL if a p*d_none() entry is encountered and the
    4905                 :            :  * size @sz doesn't match the hugepage size at this level of the page
    4906                 :            :  * table.
    4907                 :            :  */
    4908                 :          0 : pte_t *huge_pte_offset(struct mm_struct *mm,
    4909                 :            :                        unsigned long addr, unsigned long sz)
    4910                 :            : {
    4911                 :          0 :         pgd_t *pgd;
    4912                 :          0 :         p4d_t *p4d;
    4913                 :          0 :         pud_t *pud;
    4914                 :          0 :         pmd_t *pmd;
    4915                 :            : 
    4916                 :          0 :         pgd = pgd_offset(mm, addr);
    4917         [ #  # ]:          0 :         if (!pgd_present(*pgd))
    4918                 :            :                 return NULL;
    4919                 :          0 :         p4d = p4d_offset(pgd, addr);
    4920         [ #  # ]:          0 :         if (!p4d_present(*p4d))
    4921                 :            :                 return NULL;
    4922                 :            : 
    4923         [ #  # ]:          0 :         pud = pud_offset(p4d, addr);
    4924   [ #  #  #  # ]:          0 :         if (sz != PUD_SIZE && pud_none(*pud))
    4925                 :            :                 return NULL;
    4926                 :            :         /* hugepage or swap? */
    4927   [ #  #  #  # ]:          0 :         if (pud_huge(*pud) || !pud_present(*pud))
    4928                 :            :                 return (pte_t *)pud;
    4929                 :            : 
    4930         [ #  # ]:          0 :         pmd = pmd_offset(pud, addr);
    4931   [ #  #  #  # ]:          0 :         if (sz != PMD_SIZE && pmd_none(*pmd))
    4932                 :            :                 return NULL;
    4933                 :            :         /* hugepage or swap? */
    4934   [ #  #  #  # ]:          0 :         if (pmd_huge(*pmd) || !pmd_present(*pmd))
    4935                 :          0 :                 return (pte_t *)pmd;
    4936                 :            : 
    4937                 :            :         return NULL;
    4938                 :            : }
    4939                 :            : 
    4940                 :            : #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
    4941                 :            : 
    4942                 :            : /*
    4943                 :            :  * These functions are overwritable if your architecture needs its own
    4944                 :            :  * behavior.
    4945                 :            :  */
    4946                 :            : struct page * __weak
    4947                 :     276504 : follow_huge_addr(struct mm_struct *mm, unsigned long address,
    4948                 :            :                               int write)
    4949                 :            : {
    4950                 :     276504 :         return ERR_PTR(-EINVAL);
    4951                 :            : }
    4952                 :            : 
    4953                 :            : struct page * __weak
    4954                 :          0 : follow_huge_pd(struct vm_area_struct *vma,
    4955                 :            :                unsigned long address, hugepd_t hpd, int flags, int pdshift)
    4956                 :            : {
    4957                 :          0 :         WARN(1, "hugepd follow called with no support for hugepage directory format\n");
    4958                 :          0 :         return NULL;
    4959                 :            : }
    4960                 :            : 
    4961                 :            : struct page * __weak
    4962                 :          0 : follow_huge_pmd(struct mm_struct *mm, unsigned long address,
    4963                 :            :                 pmd_t *pmd, int flags)
    4964                 :            : {
    4965                 :          0 :         struct page *page = NULL;
    4966                 :          0 :         spinlock_t *ptl;
    4967                 :          0 :         pte_t pte;
    4968                 :          0 : retry:
    4969         [ #  # ]:          0 :         ptl = pmd_lockptr(mm, pmd);
    4970                 :          0 :         spin_lock(ptl);
    4971                 :            :         /*
    4972                 :            :          * make sure that the address range covered by this pmd is not
    4973                 :            :          * unmapped from other threads.
    4974                 :            :          */
    4975         [ #  # ]:          0 :         if (!pmd_huge(*pmd))
    4976                 :          0 :                 goto out;
    4977         [ #  # ]:          0 :         pte = huge_ptep_get((pte_t *)pmd);
    4978         [ #  # ]:          0 :         if (pte_present(pte)) {
    4979         [ #  # ]:          0 :                 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
    4980         [ #  # ]:          0 :                 if (flags & FOLL_GET)
    4981         [ #  # ]:          0 :                         get_page(page);
    4982                 :            :         } else {
    4983         [ #  # ]:          0 :                 if (is_hugetlb_entry_migration(pte)) {
    4984                 :          0 :                         spin_unlock(ptl);
    4985                 :          0 :                         __migration_entry_wait(mm, (pte_t *)pmd, ptl);
    4986                 :          0 :                         goto retry;
    4987                 :            :                 }
    4988                 :            :                 /*
    4989                 :            :                  * hwpoisoned entry is treated as no_page_table in
    4990                 :            :                  * follow_page_mask().
    4991                 :            :                  */
    4992                 :            :         }
    4993                 :          0 : out:
    4994                 :          0 :         spin_unlock(ptl);
    4995                 :          0 :         return page;
    4996                 :            : }
    4997                 :            : 
    4998                 :            : struct page * __weak
    4999                 :          0 : follow_huge_pud(struct mm_struct *mm, unsigned long address,
    5000                 :            :                 pud_t *pud, int flags)
    5001                 :            : {
    5002         [ #  # ]:          0 :         if (flags & FOLL_GET)
    5003                 :            :                 return NULL;
    5004                 :            : 
    5005         [ #  # ]:          0 :         return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
    5006                 :            : }
    5007                 :            : 
    5008                 :            : struct page * __weak
    5009                 :          0 : follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
    5010                 :            : {
    5011         [ #  # ]:          0 :         if (flags & FOLL_GET)
    5012                 :            :                 return NULL;
    5013                 :            : 
    5014         [ #  # ]:          0 :         return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
    5015                 :            : }
    5016                 :            : 
    5017                 :          0 : bool isolate_huge_page(struct page *page, struct list_head *list)
    5018                 :            : {
    5019                 :          0 :         bool ret = true;
    5020                 :            : 
    5021                 :          0 :         VM_BUG_ON_PAGE(!PageHead(page), page);
    5022                 :          0 :         spin_lock(&hugetlb_lock);
    5023   [ #  #  #  # ]:          0 :         if (!page_huge_active(page) || !get_page_unless_zero(page)) {
    5024                 :          0 :                 ret = false;
    5025                 :          0 :                 goto unlock;
    5026                 :            :         }
    5027                 :          0 :         clear_page_huge_active(page);
    5028                 :          0 :         list_move_tail(&page->lru, list);
    5029                 :          0 : unlock:
    5030                 :          0 :         spin_unlock(&hugetlb_lock);
    5031                 :          0 :         return ret;
    5032                 :            : }
    5033                 :            : 
    5034                 :          0 : void putback_active_hugepage(struct page *page)
    5035                 :            : {
    5036                 :          0 :         VM_BUG_ON_PAGE(!PageHead(page), page);
    5037                 :          0 :         spin_lock(&hugetlb_lock);
    5038                 :          0 :         set_page_huge_active(page);
    5039                 :          0 :         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
    5040                 :          0 :         spin_unlock(&hugetlb_lock);
    5041                 :          0 :         put_page(page);
    5042                 :          0 : }
    5043                 :            : 
    5044                 :          0 : void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
    5045                 :            : {
    5046                 :          0 :         struct hstate *h = page_hstate(oldpage);
    5047                 :            : 
    5048                 :          0 :         hugetlb_cgroup_migrate(oldpage, newpage);
    5049                 :          0 :         set_page_owner_migrate_reason(newpage, reason);
    5050                 :            : 
    5051                 :            :         /*
    5052                 :            :          * transfer temporary state of the new huge page. This is
    5053                 :            :          * reverse to other transitions because the newpage is going to
    5054                 :            :          * be final while the old one will be freed so it takes over
    5055                 :            :          * the temporary status.
    5056                 :            :          *
    5057                 :            :          * Also note that we have to transfer the per-node surplus state
    5058                 :            :          * here as well otherwise the global surplus count will not match
    5059                 :            :          * the per-node's.
    5060                 :            :          */
    5061         [ #  # ]:          0 :         if (PageHugeTemporary(newpage)) {
    5062                 :          0 :                 int old_nid = page_to_nid(oldpage);
    5063                 :          0 :                 int new_nid = page_to_nid(newpage);
    5064                 :            : 
    5065                 :          0 :                 SetPageHugeTemporary(oldpage);
    5066                 :          0 :                 ClearPageHugeTemporary(newpage);
    5067                 :            : 
    5068                 :          0 :                 spin_lock(&hugetlb_lock);
    5069         [ #  # ]:          0 :                 if (h->surplus_huge_pages_node[old_nid]) {
    5070                 :          0 :                         h->surplus_huge_pages_node[old_nid]--;
    5071                 :          0 :                         h->surplus_huge_pages_node[new_nid]++;
    5072                 :            :                 }
    5073                 :          0 :                 spin_unlock(&hugetlb_lock);
    5074                 :            :         }
    5075                 :          0 : }

Generated by: LCOV version 1.14