LCOV - Real - mm/filemap.c

LCOV - code coverage report

Current view:	top level - mm - filemap.c (source / functions)		Hit	Total	Coverage
Test:	Real	Lines:	605	954	63.4 %
Date:	2020-10-17 15:46:43	Functions:	8	78	10.3 %
Legend:	Neither, QEMU, Real, Both	Branches:	0	0	-

           Branch data     Line data    Source code

       1                 :            : // SPDX-License-Identifier: GPL-2.0-only
       2                 :            : /*
       3                 :            :  *      linux/mm/filemap.c
       4                 :            :  *
       5                 :            :  * Copyright (C) 1994-1999  Linus Torvalds
       6                 :            :  */
       7                 :            : 
       8                 :            : /*
       9                 :            :  * This file handles the generic file mmap semantics used by
      10                 :            :  * most "normal" filesystems (but you don't /have/ to use this:
      11                 :            :  * the NFS filesystem used to do this differently, for example)
      12                 :            :  */
      13                 :            : #include <linux/export.h>
      14                 :            : #include <linux/compiler.h>
      15                 :            : #include <linux/dax.h>
      16                 :            : #include <linux/fs.h>
      17                 :            : #include <linux/sched/signal.h>
      18                 :            : #include <linux/uaccess.h>
      19                 :            : #include <linux/capability.h>
      20                 :            : #include <linux/kernel_stat.h>
      21                 :            : #include <linux/gfp.h>
      22                 :            : #include <linux/mm.h>
      23                 :            : #include <linux/swap.h>
      24                 :            : #include <linux/mman.h>
      25                 :            : #include <linux/pagemap.h>
      26                 :            : #include <linux/file.h>
      27                 :            : #include <linux/uio.h>
      28                 :            : #include <linux/error-injection.h>
      29                 :            : #include <linux/hash.h>
      30                 :            : #include <linux/writeback.h>
      31                 :            : #include <linux/backing-dev.h>
      32                 :            : #include <linux/pagevec.h>
      33                 :            : #include <linux/blkdev.h>
      34                 :            : #include <linux/security.h>
      35                 :            : #include <linux/cpuset.h>
      36                 :            : #include <linux/hugetlb.h>
      37                 :            : #include <linux/memcontrol.h>
      38                 :            : #include <linux/cleancache.h>
      39                 :            : #include <linux/shmem_fs.h>
      40                 :            : #include <linux/rmap.h>
      41                 :            : #include <linux/delayacct.h>
      42                 :            : #include <linux/psi.h>
      43                 :            : #include <linux/ramfs.h>
      44                 :            : #include "internal.h"
      45                 :            : 
      46                 :            : #define CREATE_TRACE_POINTS
      47                 :            : #include <trace/events/filemap.h>
      48                 :            : 
      49                 :            : /*
      50                 :            :  * FIXME: remove all knowledge of the buffer layer from the core VM
      51                 :            :  */
      52                 :            : #include <linux/buffer_head.h> /* for try_to_free_buffers */
      53                 :            : 
      54                 :            : #include <asm/mman.h>
      55                 :            : 
      56                 :            : /*
      57                 :            :  * Shared mappings implemented 30.11.1994. It's not fully working yet,
      58                 :            :  * though.
      59                 :            :  *
      60                 :            :  * Shared mappings now work. 15.8.1995  Bruno.
      61                 :            :  *
      62                 :            :  * finished 'unifying' the page and buffer cache and SMP-threaded the
      63                 :            :  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
      64                 :            :  *
      65                 :            :  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
      66                 :            :  */
      67                 :            : 
      68                 :            : /*
      69                 :            :  * Lock ordering:
      70                 :            :  *
      71                 :            :  *  ->i_mmap_rwsem           (truncate_pagecache)
      72                 :            :  *    ->private_lock         (__free_pte->__set_page_dirty_buffers)
      73                 :            :  *      ->swap_lock          (exclusive_swap_page, others)
      74                 :            :  *        ->i_pages lock
      75                 :            :  *
      76                 :            :  *  ->i_mutex
      77                 :            :  *    ->i_mmap_rwsem         (truncate->unmap_mapping_range)
      78                 :            :  *
      79                 :            :  *  ->mmap_sem
      80                 :            :  *    ->i_mmap_rwsem
      81                 :            :  *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
      82                 :            :  *        ->i_pages lock     (arch-dependent flush_dcache_mmap_lock)
      83                 :            :  *
      84                 :            :  *  ->mmap_sem
      85                 :            :  *    ->lock_page            (access_process_vm)
      86                 :            :  *
      87                 :            :  *  ->i_mutex                        (generic_perform_write)
      88                 :            :  *    ->mmap_sem             (fault_in_pages_readable->do_page_fault)
      89                 :            :  *
      90                 :            :  *  bdi->wb.list_lock
      91                 :            :  *    sb_lock                   (fs/fs-writeback.c)
      92                 :            :  *    ->i_pages lock         (__sync_single_inode)
      93                 :            :  *
      94                 :            :  *  ->i_mmap_rwsem
      95                 :            :  *    ->anon_vma.lock                (vma_adjust)
      96                 :            :  *
      97                 :            :  *  ->anon_vma.lock
      98                 :            :  *    ->page_table_lock or pte_lock  (anon_vma_prepare and various)
      99                 :            :  *
     100                 :            :  *  ->page_table_lock or pte_lock
     101                 :            :  *    ->swap_lock            (try_to_unmap_one)
     102                 :            :  *    ->private_lock         (try_to_unmap_one)
     103                 :            :  *    ->i_pages lock         (try_to_unmap_one)
     104                 :            :  *    ->pgdat->lru_lock           (follow_page->mark_page_accessed)
     105                 :            :  *    ->pgdat->lru_lock           (check_pte_range->isolate_lru_page)
     106                 :            :  *    ->private_lock         (page_remove_rmap->set_page_dirty)
     107                 :            :  *    ->i_pages lock         (page_remove_rmap->set_page_dirty)
     108                 :            :  *    bdi.wb->list_lock              (page_remove_rmap->set_page_dirty)
     109                 :            :  *    ->inode->i_lock             (page_remove_rmap->set_page_dirty)
     110                 :            :  *    ->memcg->move_lock  (page_remove_rmap->lock_page_memcg)
     111                 :            :  *    bdi.wb->list_lock              (zap_pte_range->set_page_dirty)
     112                 :            :  *    ->inode->i_lock             (zap_pte_range->set_page_dirty)
     113                 :            :  *    ->private_lock         (zap_pte_range->__set_page_dirty_buffers)
     114                 :            :  *
     115                 :            :  *  ->i_mmap_rwsem
     116                 :            :  *    ->tasklist_lock        (memory_failure, collect_procs_ao)
     117                 :            :  */
     118                 :            : /* Remove @page from mapping->i_pages, leaving @shadow (may be NULL) in its slot, and fix up the mapping's counters. */
     119                 :          3 : static void page_cache_delete(struct address_space *mapping,
     120                 :            :                                    struct page *page, void *shadow)
     121                 :            : {
     122                 :          3 :         XA_STATE(xas, &mapping->i_pages, page->index);
     123                 :            :         unsigned int nr = 1; /* pages accounted for @page; stays 1 for hugetlb */
     124                 :            : 
     125                 :          3 :         mapping_set_update(&xas, mapping);
     126                 :            : 
     127                 :            :         /* hugetlb pages are represented by a single entry in the xarray */
     128                 :            :         if (!PageHuge(page)) {
     129                 :          3 :                 xas_set_order(&xas, page->index, compound_order(page));
     130                 :            :                 nr = compound_nr(page);
     131                 :            :         }
     132                 :            :         /* Sanity: page is locked, is not a tail page, and a shadow is only given when nr == 1. */
     133                 :            :         VM_BUG_ON_PAGE(!PageLocked(page), page);
     134                 :            :         VM_BUG_ON_PAGE(PageTail(page), page);
     135                 :            :         VM_BUG_ON_PAGE(nr != 1 && shadow, page);
     136                 :            : 
     137                 :          3 :         xas_store(&xas, shadow); /* overwrite the page's entry with @shadow */
     138                 :          3 :         xas_init_marks(&xas);
     139                 :            : 
     140                 :          3 :         page->mapping = NULL;
     141                 :            :         /* Leave page->index set: truncation lookup relies upon it */
     142                 :            : 
     143                 :          3 :         if (shadow) {
     144                 :          0 :                 mapping->nrexceptional += nr;
     145                 :            :                 /*
     146                 :            :                  * Make sure the nrexceptional update is committed before
     147                 :            :                  * the nrpages update so that final truncate racing
     148                 :            :                  * with reclaim does not see both counters 0 at the
     149                 :            :                  * same time and miss a shadow entry.
     150                 :            :                  */
     151                 :          0 :                 smp_wmb();
     152                 :            :         }
     153                 :          3 :         mapping->nrpages -= nr;
     154                 :          3 : }
     155                 :            : /* Update cleancache and the per-node page statistics for @page, which is being removed from @mapping's page cache. */
     156                 :          3 : static void unaccount_page_cache_page(struct address_space *mapping,
     157                 :            :                                       struct page *page)
     158                 :            : {
     159                 :            :         int nr; /* page count from hpage_nr_pages(), used for the node counters */
     160                 :            : 
     161                 :            :         /*
     162                 :            :          * if we're uptodate, flush out into the cleancache, otherwise
     163                 :            :          * invalidate any existing cleancache entries.  We can't leave
     164                 :            :          * stale data around in the cleancache once our page is gone
     165                 :            :          */
     166                 :          3 :         if (PageUptodate(page) && PageMappedToDisk(page))
     167                 :          0 :                 cleancache_put_page(page);
     168                 :            :         else
     169                 :            :                 cleancache_invalidate_page(mapping, page);
     170                 :            :         /* The page must be a head page that is no longer mapped; without CONFIG_DEBUG_VM a still-mapped page is reported below instead. */
     171                 :            :         VM_BUG_ON_PAGE(PageTail(page), page);
     172                 :            :         VM_BUG_ON_PAGE(page_mapped(page), page);
     173                 :          3 :         if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
     174                 :            :                 int mapcount;
     175                 :            : 
     176                 :          0 :                 pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
     177                 :            :                          current->comm, page_to_pfn(page));
     178                 :          0 :                 dump_page(page, "still mapped when deleted");
     179                 :          0 :                 dump_stack();
     180                 :          0 :                 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     181                 :            :                 /* Decide whether the leftover mapcount (and the references behind it) can be discarded. */
     182                 :          0 :                 mapcount = page_mapcount(page);
     183                 :          0 :                 if (mapping_exiting(mapping) &&
     184                 :          0 :                     page_count(page) >= mapcount + 2) {
     185                 :            :                         /*
     186                 :            :                          * All vmas have already been torn down, so it's
     187                 :            :                          * a good bet that actually the page is unmapped,
     188                 :            :                          * and we'd prefer not to leak it: if we're wrong,
     189                 :            :                          * some other bad page check should catch it later.
     190                 :            :                          */
     191                 :            :                         page_mapcount_reset(page);
     192                 :            :                         page_ref_sub(page, mapcount);
     193                 :            :                 }
     194                 :            :         }
     195                 :            : 
     196                 :            :         /* hugetlb pages do not participate in page cache accounting. */
     197                 :            :         if (PageHuge(page))
     198                 :          3 :                 return;
     199                 :            : 
     200                 :            :         nr = hpage_nr_pages(page);
     201                 :            :         /* Node counters: NR_FILE_PAGES always; NR_SHMEM for swap-backed pages; one THP counter for transparent huge pages. */
     202                 :          3 :         __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
     203                 :          3 :         if (PageSwapBacked(page)) {
     204                 :          3 :                 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
     205                 :            :                 if (PageTransHuge(page))
     206                 :            :                         __dec_node_page_state(page, NR_SHMEM_THPS);
     207                 :            :         } else if (PageTransHuge(page)) {
     208                 :            :                 __dec_node_page_state(page, NR_FILE_THPS);
     209                 :            :                 filemap_nr_thps_dec(mapping);
     210                 :            :         }
     211                 :            : 
     212                 :            :         /*
     213                 :            :          * At this point page must be either written or cleaned by
     214                 :            :          * truncate.  Dirty page here signals a bug and loss of
     215                 :            :          * unwritten data.
     216                 :            :          *
     217                 :            :          * This fixes dirty accounting after removing the page entirely
     218                 :            :          * but leaves PageDirty set: it has no effect for truncated
     219                 :            :          * page and anyway will be cleared before returning page into
     220                 :            :          * buddy allocator.
     221                 :            :          */
     222                 :          3 :         if (WARN_ON_ONCE(PageDirty(page)))
     223                 :          0 :                 account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
     224                 :            : }
     225                 :            : 
     226                 :            : /*
     227                 :            :  * Delete a page from the page cache: unaccount it and replace its entry
     228                 :            :  * with @shadow (may be NULL).  The page is not released here.  Caller has to
     229                 :            :  * make sure the page is locked and that nobody else uses it - or that usage is safe.  The caller must hold the i_pages lock.
     230                 :            :  */
     231                 :          3 : void __delete_from_page_cache(struct page *page, void *shadow)
     232                 :            : {
     233                 :          3 :         struct address_space *mapping = page->mapping; /* read before page_cache_delete() clears it */
     234                 :            : 
     235                 :          3 :         trace_mm_filemap_delete_from_page_cache(page);
     236                 :            : 
     237                 :          3 :         unaccount_page_cache_page(mapping, page);
     238                 :          3 :         page_cache_delete(mapping, page, shadow);
     239                 :          3 : }
     240                 :            : /* Call the mapping's ->freepage hook, if it has one, then drop the reference(s) held on @page. */
     241                 :          3 : static void page_cache_free_page(struct address_space *mapping,
     242                 :            :                                 struct page *page)
     243                 :            : {
     244                 :            :         void (*freepage)(struct page *);
     245                 :            : 
     246                 :          3 :         freepage = mapping->a_ops->freepage;
     247                 :          3 :         if (freepage)
     248                 :          0 :                 freepage(page);
     249                 :            :         /* Non-hugetlb THP: drop HPAGE_PMD_NR references at once (presumably one per subpage - TODO confirm); otherwise a single put_page(). */
     250                 :            :         if (PageTransHuge(page) && !PageHuge(page)) {
     251                 :            :                 page_ref_sub(page, HPAGE_PMD_NR);
     252                 :            :                 VM_BUG_ON_PAGE(page_count(page) <= 0, page);
     253                 :            :         } else {
     254                 :          3 :                 put_page(page);
     255                 :            :         }
     256                 :          3 : }
     257                 :            : 
     258                 :            : /**
     259                 :            :  * delete_from_page_cache - delete page from page cache
     260                 :            :  * @page: the page which the kernel is trying to remove from page cache
     261                 :            :  *
     262                 :            :  * This must be called only on pages that have been verified to be in the page
     263                 :            :  * cache and locked.  It will never put the page into the free list, the caller
     264                 :            :  * has a reference on the page.
     265                 :            :  */
     266                 :          3 : void delete_from_page_cache(struct page *page)
     267                 :            : {
     268                 :          3 :         struct address_space *mapping = page_mapping(page);
     269                 :            :         unsigned long flags; /* saved IRQ state for xa_lock_irqsave() */
     270                 :            :         /* Remove the page (no shadow entry) while holding the i_pages lock. */
     271                 :          3 :         BUG_ON(!PageLocked(page));
     272                 :          3 :         xa_lock_irqsave(&mapping->i_pages, flags);
     273                 :          3 :         __delete_from_page_cache(page, NULL);
     274                 :            :         xa_unlock_irqrestore(&mapping->i_pages, flags);
     275                 :            :         /* With the lock dropped, run ->freepage and drop the page cache's reference. */
     276                 :          3 :         page_cache_free_page(mapping, page);
     277                 :          3 : }
     278                 :            : EXPORT_SYMBOL(delete_from_page_cache);
     279                 :            : 
     280                 :            : /*
     281                 :            :  * page_cache_delete_batch - delete several pages from page cache
     282                 :            :  * @mapping: the mapping to which pages belong
     283                 :            :  * @pvec: pagevec with pages to delete
     284                 :            :  *
     285                 :            :  * The function walks over mapping->i_pages and removes pages passed in @pvec
     286                 :            :  * from the mapping. The function expects @pvec to be sorted by page index
     287                 :            :  * and is optimised for it to be dense.
     288                 :            :  * It tolerates holes in @pvec (mapping entries at those indices are not
     289                 :            :  * modified). The function expects only THP head pages to be present in the
     290                 :            :  * @pvec.
     291                 :            :  *
     292                 :            :  * The function expects the i_pages lock to be held.
     293                 :            :  */
     294                 :          3 : static void page_cache_delete_batch(struct address_space *mapping,
     295                 :            :                              struct pagevec *pvec)
     296                 :            : {
     297                 :          3 :         XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
     298                 :            :         int total_pages = 0; /* entries cleared from i_pages so far */
     299                 :            :         int i = 0; /* index of the @pvec page expected next in the walk */
     300                 :            :         struct page *page;
     301                 :            : 
     302                 :          3 :         mapping_set_update(&xas, mapping);
     303                 :          3 :         xas_for_each(&xas, page, ULONG_MAX) {
     304                 :          3 :                 if (i >= pagevec_count(pvec))
     305                 :            :                         break;
     306                 :            : 
     307                 :            :                 /* A swap/dax/shadow entry got inserted? Skip it. */
     308                 :          3 :                 if (xa_is_value(page))
     309                 :          0 :                         continue;
     310                 :            :                 /*
     311                 :            :                  * A page got inserted in our range? Skip it. We have our
     312                 :            :                  * pages locked so they are protected from being removed.
     313                 :            :                  * If we see a page whose index is higher than ours, it
     314                 :            :                  * means our page has been removed, which shouldn't be
     315                 :            :                  * possible because we're holding the PageLock.
     316                 :            :                  */
     317                 :          3 :                 if (page != pvec->pages[i]) {
     318                 :            :                         VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
     319                 :            :                                         page);
     320                 :          2 :                         continue;
     321                 :            :                 }
     322                 :            : 
     323                 :          3 :                 WARN_ON_ONCE(!PageLocked(page));
     324                 :            :                 /* Clear page->mapping only when the walk is at the page's own index. */
     325                 :          3 :                 if (page->index == xas.xa_index)
     326                 :          3 :                         page->mapping = NULL;
     327                 :            :                 /* Leave page->index set: truncation lookup relies on it */
     328                 :            : 
     329                 :            :                 /*
     330                 :            :                  * Move to the next page in the vector if this is a regular
     331                 :            :                  * page or the index is of the last sub-page of this compound
     332                 :            :                  * page.
     333                 :            :                  */
     334                 :          3 :                 if (page->index + compound_nr(page) - 1 == xas.xa_index)
     335                 :          3 :                         i++;
     336                 :          3 :                 xas_store(&xas, NULL); /* clear the current entry */
     337                 :          3 :                 total_pages++;
     338                 :            :         }
     339                 :          3 :         mapping->nrpages -= total_pages;
     340                 :          3 : }
     341                 :            : /* Remove every page in @pvec from @mapping: unaccount and delete them under the i_pages lock, then free them after unlocking. */
     342                 :          3 : void delete_from_page_cache_batch(struct address_space *mapping,
     343                 :            :                                   struct pagevec *pvec)
     344                 :            : {
     345                 :            :         int i;
     346                 :            :         unsigned long flags; /* saved IRQ state for xa_lock_irqsave() */
     347                 :            :         /* An empty pagevec is a no-op. */
     348                 :          3 :         if (!pagevec_count(pvec))
     349                 :          3 :                 return;
     350                 :            : 
     351                 :          3 :         xa_lock_irqsave(&mapping->i_pages, flags);
     352                 :          3 :         for (i = 0; i < pagevec_count(pvec); i++) {
     353                 :          3 :                 trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
     354                 :            : 
     355                 :          3 :                 unaccount_page_cache_page(mapping, pvec->pages[i]);
     356                 :            :         }
     357                 :          3 :         page_cache_delete_batch(mapping, pvec);
     358                 :            :         xa_unlock_irqrestore(&mapping->i_pages, flags);
     359                 :            :         /* Run ->freepage and drop references only once the lock has been released. */
     360                 :          3 :         for (i = 0; i < pagevec_count(pvec); i++)
     361                 :          3 :                 page_cache_free_page(mapping, pvec->pages[i]);
     362                 :            : }
     363                 :            : /* Report and clear AS_ENOSPC/AS_EIO on @mapping. Returns -EIO if AS_EIO was set, else -ENOSPC if AS_ENOSPC was set, else 0. */
     364                 :          3 : int filemap_check_errors(struct address_space *mapping)
     365                 :            : {
     366                 :            :         int ret = 0;
     367                 :            :         /* Check for outstanding write errors */
     368                 :          3 :         if (test_bit(AS_ENOSPC, &mapping->flags) &&
     369                 :          0 :             test_and_clear_bit(AS_ENOSPC, &mapping->flags))
     370                 :            :                 ret = -ENOSPC;
     371                 :          3 :         if (test_bit(AS_EIO, &mapping->flags) &&
     372                 :          0 :             test_and_clear_bit(AS_EIO, &mapping->flags))
     373                 :            :                 ret = -EIO; /* checked last, so it overrides -ENOSPC */
     374                 :          3 :         return ret;
     375                 :            : }
     376                 :            : EXPORT_SYMBOL(filemap_check_errors);
     377                 :            : /* Like filemap_check_errors() but leaves the flags set: returns -EIO if AS_EIO is set, else -ENOSPC if AS_ENOSPC is set, else 0. */
     378                 :            : static int filemap_check_and_keep_errors(struct address_space *mapping)
     379                 :            : {
     380                 :            :         /* Check for outstanding write errors */
     381                 :          3 :         if (test_bit(AS_EIO, &mapping->flags))
     382                 :            :                 return -EIO;
     383                 :          3 :         if (test_bit(AS_ENOSPC, &mapping->flags))
     384                 :            :                 return -ENOSPC;
     385                 :            :         return 0;
     386                 :            : }
     387                 :            : 
     388                 :            : /**
     389                 :            :  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
     390                 :            :  * @mapping:    address space structure to write
     391                 :            :  * @start:      offset in bytes where the range starts
     392                 :            :  * @end:        offset in bytes where the range ends (inclusive)
     393                 :            :  * @sync_mode:  enable synchronous operation
     394                 :            :  *
     395                 :            :  * Start writeback against all of a mapping's dirty pages that lie
     396                 :            :  * within the byte offsets <start, end> inclusive.
     397                 :            :  *
     398                 :            :  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
     399                 :            :  * opposed to a regular memory cleansing writeback.  The difference between
     400                 :            :  * these two operations is that if a dirty page/buffer is encountered, it must
     401                 :            :  * be waited upon, and not just skipped over.
     402                 :            :  *
     403                 :            :  * Return: %0 on success, negative error code otherwise.
     404                 :            :  */
     405                 :          3 : int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
     406                 :            :                                 loff_t end, int sync_mode)
     407                 :            : {
     408                 :            :         int ret;
     409                 :          3 :         struct writeback_control wbc = {
     410                 :            :                 .sync_mode = sync_mode,
     411                 :            :                 .nr_to_write = LONG_MAX, /* largest possible value: no practical cap */
     412                 :            :                 .range_start = start,
     413                 :            :                 .range_end = end,
     414                 :            :         };
     415                 :            :         /* Return success early if the mapping cannot write back dirty pages or has nothing tagged dirty. */
     416                 :          3 :         if (!mapping_cap_writeback_dirty(mapping) ||
     417                 :            :             !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
     418                 :            :                 return 0;
     419                 :            :         /* The wbc is attached to the inode only for the duration of do_writepages(). */
     420                 :          3 :         wbc_attach_fdatawrite_inode(&wbc, mapping->host);
     421                 :          3 :         ret = do_writepages(mapping, &wbc);
     422                 :          3 :         wbc_detach_inode(&wbc);
     423                 :          3 :         return ret;
     424                 :            : }
     425                 :            : 
     426                 :            : static inline int __filemap_fdatawrite(struct address_space *mapping,
     427                 :            :         int sync_mode)
     428                 :            : {
     429                 :          3 :         return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
     430                 :            : }
     431                 :            : 
     432                 :          3 : int filemap_fdatawrite(struct address_space *mapping)
     433                 :            : {
     434                 :          3 :         return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
     435                 :            : }
     436                 :            : EXPORT_SYMBOL(filemap_fdatawrite);
     437                 :            : 
     438                 :          0 : int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
     439                 :            :                                 loff_t end)
     440                 :            : {
     441                 :          0 :         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
     442                 :            : }
     443                 :            : EXPORT_SYMBOL(filemap_fdatawrite_range);
     444                 :            : 
     445                 :            : /**
     446                 :            :  * filemap_flush - mostly a non-blocking flush
     447                 :            :  * @mapping:    target address_space
     448                 :            :  *
     449                 :            :  * This is a mostly non-blocking flush.  Not suitable for data-integrity
     450                 :            :  * purposes - I/O may not be started against all dirty pages.
     451                 :            :  *
     452                 :            :  * Return: %0 on success, negative error code otherwise.
     453                 :            :  */
     454                 :          3 : int filemap_flush(struct address_space *mapping)
     455                 :            : {
     456                 :          3 :         return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
     457                 :            : }
     458                 :            : EXPORT_SYMBOL(filemap_flush);
     459                 :            : 
     460                 :            : /**
     461                 :            :  * filemap_range_has_page - check if a page exists in range.
     462                 :            :  * @mapping:           address space within which to check
     463                 :            :  * @start_byte:        offset in bytes where the range starts
     464                 :            :  * @end_byte:          offset in bytes where the range ends (inclusive)
     465                 :            :  *
     466                 :            :  * Find at least one page in the range supplied, usually used to check if
     467                 :            :  * direct writing in this range will trigger a writeback.
     468                 :            :  *
     469                 :            :  * Return: %true if at least one page exists in the specified range,
     470                 :            :  * %false otherwise.
     471                 :            :  */
     472                 :          0 : bool filemap_range_has_page(struct address_space *mapping,
     473                 :            :                            loff_t start_byte, loff_t end_byte)
     474                 :            : {
     475                 :            :         struct page *page;
     476                 :          0 :         XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
     477                 :          0 :         pgoff_t max = end_byte >> PAGE_SHIFT;
     478                 :            : 
     479                 :          0 :         if (end_byte < start_byte)
     480                 :            :                 return false;
     481                 :            : 
     482                 :            :         rcu_read_lock();
     483                 :            :         for (;;) {
     484                 :          0 :                 page = xas_find(&xas, max);
     485                 :          0 :                 if (xas_retry(&xas, page))
     486                 :          0 :                         continue;
     487                 :            :                 /* Shadow entries don't count */
     488                 :          0 :                 if (xa_is_value(page))
     489                 :          0 :                         continue;
     490                 :            :                 /*
     491                 :            :                  * We don't need to try to pin this page; we're about to
     492                 :            :                  * release the RCU lock anyway.  It is enough to know that
     493                 :            :                  * there was a page here recently.
     494                 :            :                  */
     495                 :            :                 break;
     496                 :            :         }
     497                 :            :         rcu_read_unlock();
     498                 :            : 
     499                 :          0 :         return page != NULL;
     500                 :            : }
     501                 :            : EXPORT_SYMBOL(filemap_range_has_page);
     502                 :            : 
     503                 :          3 : static void __filemap_fdatawait_range(struct address_space *mapping,
     504                 :            :                                      loff_t start_byte, loff_t end_byte)
     505                 :            : {
     506                 :          3 :         pgoff_t index = start_byte >> PAGE_SHIFT;
     507                 :          3 :         pgoff_t end = end_byte >> PAGE_SHIFT;
     508                 :            :         struct pagevec pvec;
     509                 :            :         int nr_pages;
     510                 :            : 
     511                 :          3 :         if (end_byte < start_byte)
     512                 :          0 :                 return;
     513                 :            : 
     514                 :            :         pagevec_init(&pvec);
     515                 :          3 :         while (index <= end) {
     516                 :            :                 unsigned i;
     517                 :            : 
     518                 :          3 :                 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
     519                 :            :                                 end, PAGECACHE_TAG_WRITEBACK);
     520                 :          3 :                 if (!nr_pages)
     521                 :            :                         break;
     522                 :            : 
     523                 :          3 :                 for (i = 0; i < nr_pages; i++) {
     524                 :          3 :                         struct page *page = pvec.pages[i];
     525                 :            : 
     526                 :          3 :                         wait_on_page_writeback(page);
     527                 :            :                         ClearPageError(page);
     528                 :            :                 }
     529                 :            :                 pagevec_release(&pvec);
     530                 :          3 :                 cond_resched();
     531                 :            :         }
     532                 :            : }
     533                 :            : 
     534                 :            : /**
     535                 :            :  * filemap_fdatawait_range - wait for writeback to complete
     536                 :            :  * @mapping:            address space structure to wait for
     537                 :            :  * @start_byte:         offset in bytes where the range starts
     538                 :            :  * @end_byte:           offset in bytes where the range ends (inclusive)
     539                 :            :  *
     540                 :            :  * Walk the list of under-writeback pages of the given address space
     541                 :            :  * in the given range and wait for all of them.  Check error status of
     542                 :            :  * the address space and return it.
     543                 :            :  *
     544                 :            :  * Since the error status of the address space is cleared by this function,
     545                 :            :  * callers are responsible for checking the return value and handling and/or
     546                 :            :  * reporting the error.
     547                 :            :  *
     548                 :            :  * Return: error status of the address space.
     549                 :            :  */
     550                 :          0 : int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
     551                 :            :                             loff_t end_byte)
     552                 :            : {
     553                 :          3 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     554                 :          3 :         return filemap_check_errors(mapping);
     555                 :            : }
     556                 :            : EXPORT_SYMBOL(filemap_fdatawait_range);
     557                 :            : 
     558                 :            : /**
     559                 :            :  * filemap_fdatawait_range_keep_errors - wait for writeback to complete
     560                 :            :  * @mapping:            address space structure to wait for
     561                 :            :  * @start_byte:         offset in bytes where the range starts
     562                 :            :  * @end_byte:           offset in bytes where the range ends (inclusive)
     563                 :            :  *
     564                 :            :  * Walk the list of under-writeback pages of the given address space in the
     565                 :            :  * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
     566                 :            :  * this function does not clear error status of the address space.
     567                 :            :  *
     568                 :            :  * Use this function if callers don't handle errors themselves.  Expected
     569                 :            :  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
     570                 :            :  * fsfreeze(8)
     571                 :            :  */
     572                 :          3 : int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
     573                 :            :                 loff_t start_byte, loff_t end_byte)
     574                 :            : {
     575                 :          3 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     576                 :          3 :         return filemap_check_and_keep_errors(mapping);
     577                 :            : }
     578                 :            : EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
     579                 :            : 
     580                 :            : /**
     581                 :            :  * file_fdatawait_range - wait for writeback to complete
     582                 :            :  * @file:               file pointing to address space structure to wait for
     583                 :            :  * @start_byte:         offset in bytes where the range starts
     584                 :            :  * @end_byte:           offset in bytes where the range ends (inclusive)
     585                 :            :  *
     586                 :            :  * Walk the list of under-writeback pages of the address space that file
     587                 :            :  * refers to, in the given range and wait for all of them.  Check error
     588                 :            :  * status of the address space vs. the file->f_wb_err cursor and return it.
     589                 :            :  *
     590                 :            :  * Since the error status of the file is advanced by this function,
     591                 :            :  * callers are responsible for checking the return value and handling and/or
     592                 :            :  * reporting the error.
     593                 :            :  *
     594                 :            :  * Return: error status of the address space vs. the file->f_wb_err cursor.
     595                 :            :  */
     596                 :          0 : int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
     597                 :            : {
     598                 :          0 :         struct address_space *mapping = file->f_mapping;
     599                 :            : 
     600                 :          0 :         __filemap_fdatawait_range(mapping, start_byte, end_byte);
     601                 :          0 :         return file_check_and_advance_wb_err(file);
     602                 :            : }
     603                 :            : EXPORT_SYMBOL(file_fdatawait_range);
     604                 :            : 
     605                 :            : /**
     606                 :            :  * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
     607                 :            :  * @mapping: address space structure to wait for
     608                 :            :  *
     609                 :            :  * Walk the list of under-writeback pages of the given address space
     610                 :            :  * and wait for all of them.  Unlike filemap_fdatawait(), this function
     611                 :            :  * does not clear error status of the address space.
     612                 :            :  *
     613                 :            :  * Use this function if callers don't handle errors themselves.  Expected
     614                 :            :  * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
     615                 :            :  * fsfreeze(8)
     616                 :            :  *
     617                 :            :  * Return: error status of the address space.
     618                 :            :  */
     619                 :          3 : int filemap_fdatawait_keep_errors(struct address_space *mapping)
     620                 :            : {
     621                 :          3 :         __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
     622                 :          3 :         return filemap_check_and_keep_errors(mapping);
     623                 :            : }
     624                 :            : EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
     625                 :            : 
     626                 :            : /* Returns true if writeback might be needed or already in progress. */
     627                 :            : static bool mapping_needs_writeback(struct address_space *mapping)
     628                 :            : {
     629                 :            :         if (dax_mapping(mapping))
     630                 :            :                 return mapping->nrexceptional;
     631                 :            : 
     632                 :          3 :         return mapping->nrpages;
     633                 :            : }
     634                 :            : 
     635                 :          3 : int filemap_write_and_wait(struct address_space *mapping)
     636                 :            : {
     637                 :            :         int err = 0;
     638                 :            : 
     639                 :          3 :         if (mapping_needs_writeback(mapping)) {
     640                 :            :                 err = filemap_fdatawrite(mapping);
     641                 :            :                 /*
     642                 :            :                  * Even if the above returned error, the pages may be
     643                 :            :                  * written partially (e.g. -ENOSPC), so we wait for it.
     644                 :            :                  * But the -EIO is special case, it may indicate the worst
     645                 :            :                  * thing (e.g. bug) happened, so we avoid waiting for it.
     646                 :            :                  */
     647                 :          3 :                 if (err != -EIO) {
     648                 :          3 :                         int err2 = filemap_fdatawait(mapping);
     649                 :          3 :                         if (!err)
     650                 :            :                                 err = err2;
     651                 :            :                 } else {
     652                 :            :                         /* Clear any previously stored errors */
     653                 :          0 :                         filemap_check_errors(mapping);
     654                 :            :                 }
     655                 :            :         } else {
     656                 :          3 :                 err = filemap_check_errors(mapping);
     657                 :            :         }
     658                 :          3 :         return err;
     659                 :            : }
     660                 :            : EXPORT_SYMBOL(filemap_write_and_wait);
     661                 :            : 
     662                 :            : /**
     663                 :            :  * filemap_write_and_wait_range - write out & wait on a file range
     664                 :            :  * @mapping:    the address_space for the pages
     665                 :            :  * @lstart:     offset in bytes where the range starts
     666                 :            :  * @lend:       offset in bytes where the range ends (inclusive)
     667                 :            :  *
     668                 :            :  * Write out and wait upon file offsets lstart->lend, inclusive.
     669                 :            :  *
     670                 :            :  * Note that @lend is inclusive (describes the last byte to be written) so
     671                 :            :  * that this function can be used to write to the very end-of-file (end = -1).
     672                 :            :  *
     673                 :            :  * Return: error status of the address space.
     674                 :            :  */
     675                 :          0 : int filemap_write_and_wait_range(struct address_space *mapping,
     676                 :            :                                  loff_t lstart, loff_t lend)
     677                 :            : {
     678                 :            :         int err = 0;
     679                 :            : 
     680                 :          0 :         if (mapping_needs_writeback(mapping)) {
     681                 :          0 :                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
     682                 :            :                                                  WB_SYNC_ALL);
     683                 :            :                 /* See comment of filemap_write_and_wait() */
     684                 :          0 :                 if (err != -EIO) {
     685                 :            :                         int err2 = filemap_fdatawait_range(mapping,
     686                 :            :                                                 lstart, lend);
     687                 :          0 :                         if (!err)
     688                 :            :                                 err = err2;
     689                 :            :                 } else {
     690                 :            :                         /* Clear any previously stored errors */
     691                 :          0 :                         filemap_check_errors(mapping);
     692                 :            :                 }
     693                 :            :         } else {
     694                 :          0 :                 err = filemap_check_errors(mapping);
     695                 :            :         }
     696                 :          0 :         return err;
     697                 :            : }
     698                 :            : EXPORT_SYMBOL(filemap_write_and_wait_range);
     699                 :            : 
     700                 :          0 : void __filemap_set_wb_err(struct address_space *mapping, int err)
     701                 :            : {
     702                 :          0 :         errseq_t eseq = errseq_set(&mapping->wb_err, err);
     703                 :            : 
     704                 :          0 :         trace_filemap_set_wb_err(mapping, eseq);
     705                 :          0 : }
     706                 :            : EXPORT_SYMBOL(__filemap_set_wb_err);
     707                 :            : 
     708                 :            : /**
     709                 :            :  * file_check_and_advance_wb_err - report wb error (if any) that was previously
     710                 :            :  *                                 and advance wb_err to current one
     711                 :            :  * @file: struct file on which the error is being reported
     712                 :            :  *
     713                 :            :  * When userland calls fsync (or something like nfsd does the equivalent), we
     714                 :            :  * want to report any writeback errors that occurred since the last fsync (or
     715                 :            :  * since the file was opened if there haven't been any).
     716                 :            :  *
     717                 :            :  * Grab the wb_err from the mapping. If it matches what we have in the file,
     718                 :            :  * then just quickly return 0. The file is all caught up.
     719                 :            :  *
     720                 :            :  * If it doesn't match, then take the mapping value, set the "seen" flag in
     721                 :            :  * it and try to swap it into place. If it works, or another task beat us
     722                 :            :  * to it with the new value, then update the f_wb_err and return the error
     723                 :            :  * portion. The error at this point must be reported via proper channels
     724                 :            :  * (a'la fsync, or NFS COMMIT operation, etc.).
     725                 :            :  *
     726                 :            :  * While we handle mapping->wb_err with atomic operations, the f_wb_err
     727                 :            :  * value is protected by the f_lock since we must ensure that it reflects
     728                 :            :  * the latest value swapped in for this file descriptor.
     729                 :            :  *
     730                 :            :  * Return: %0 on success, negative error code otherwise.
     731                 :            :  */
     732                 :          3 : int file_check_and_advance_wb_err(struct file *file)
     733                 :            : {
     734                 :            :         int err = 0;
     735                 :            :         errseq_t old = READ_ONCE(file->f_wb_err);
     736                 :          3 :         struct address_space *mapping = file->f_mapping;
     737                 :            : 
     738                 :            :         /* Locklessly handle the common case where nothing has changed */
     739                 :          3 :         if (errseq_check(&mapping->wb_err, old)) {
     740                 :            :                 /* Something changed, must use slow path */
     741                 :            :                 spin_lock(&file->f_lock);
     742                 :          0 :                 old = file->f_wb_err;
     743                 :          0 :                 err = errseq_check_and_advance(&mapping->wb_err,
     744                 :            :                                                 &file->f_wb_err);
     745                 :          0 :                 trace_file_check_and_advance_wb_err(file, old);
     746                 :            :                 spin_unlock(&file->f_lock);
     747                 :            :         }
     748                 :            : 
     749                 :            :         /*
     750                 :            :          * We're mostly using this function as a drop in replacement for
     751                 :            :          * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
     752                 :            :          * that the legacy code would have had on these flags.
     753                 :            :          */
     754                 :          3 :         clear_bit(AS_EIO, &mapping->flags);
     755                 :          3 :         clear_bit(AS_ENOSPC, &mapping->flags);
     756                 :          3 :         return err;
     757                 :            : }
     758                 :            : EXPORT_SYMBOL(file_check_and_advance_wb_err);
     759                 :            : 
     760                 :            : /**
     761                 :            :  * file_write_and_wait_range - write out & wait on a file range
     762                 :            :  * @file:       file pointing to address_space with pages
     763                 :            :  * @lstart:     offset in bytes where the range starts
     764                 :            :  * @lend:       offset in bytes where the range ends (inclusive)
     765                 :            :  *
     766                 :            :  * Write out and wait upon file offsets lstart->lend, inclusive.
     767                 :            :  *
     768                 :            :  * Note that @lend is inclusive (describes the last byte to be written) so
     769                 :            :  * that this function can be used to write to the very end-of-file (end = -1).
     770                 :            :  *
     771                 :            :  * After writing out and waiting on the data, we check and advance the
     772                 :            :  * f_wb_err cursor to the latest value, and return any errors detected there.
     773                 :            :  *
     774                 :            :  * Return: %0 on success, negative error code otherwise.
     775                 :            :  */
     776                 :          3 : int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
     777                 :            : {
     778                 :            :         int err = 0, err2;
     779                 :          3 :         struct address_space *mapping = file->f_mapping;
     780                 :            : 
     781                 :          3 :         if (mapping_needs_writeback(mapping)) {
     782                 :          3 :                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
     783                 :            :                                                  WB_SYNC_ALL);
     784                 :            :                 /* See comment of filemap_write_and_wait() */
     785                 :          3 :                 if (err != -EIO)
     786                 :          3 :                         __filemap_fdatawait_range(mapping, lstart, lend);
     787                 :            :         }
     788                 :          3 :         err2 = file_check_and_advance_wb_err(file);
     789                 :          3 :         if (!err)
     790                 :            :                 err = err2;
     791                 :          3 :         return err;
     792                 :            : }
     793                 :            : EXPORT_SYMBOL(file_write_and_wait_range);
     794                 :            : 
     795                 :            : /**
     796                 :            :  * replace_page_cache_page - replace a pagecache page with a new one
     797                 :            :  * @old:        page to be replaced
     798                 :            :  * @new:        page to replace with
     799                 :            :  * @gfp_mask:   allocation mode
     800                 :            :  *
     801                 :            :  * This function replaces a page in the pagecache with a new one.  On
     802                 :            :  * success it acquires the pagecache reference for the new page and
     803                 :            :  * drops it for the old page.  Both the old and new pages must be
     804                 :            :  * locked.  This function does not add the new page to the LRU, the
     805                 :            :  * caller must do that.
     806                 :            :  *
     807                 :            :  * The remove + add is atomic.  This function cannot fail.
     808                 :            :  *
     809                 :            :  * Return: %0
     810                 :            :  */
     811                 :          0 : int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
     812                 :            : {
     813                 :          0 :         struct address_space *mapping = old->mapping;
     814                 :          0 :         void (*freepage)(struct page *) = mapping->a_ops->freepage;
     815                 :          0 :         pgoff_t offset = old->index;
     816                 :          0 :         XA_STATE(xas, &mapping->i_pages, offset);
     817                 :            :         unsigned long flags;
     818                 :            : 
     819                 :            :         VM_BUG_ON_PAGE(!PageLocked(old), old);
     820                 :            :         VM_BUG_ON_PAGE(!PageLocked(new), new);
     821                 :            :         VM_BUG_ON_PAGE(new->mapping, new);
     822                 :            : 
     823                 :          0 :         get_page(new);
     824                 :          0 :         new->mapping = mapping;
     825                 :          0 :         new->index = offset;
     826                 :            : 
     827                 :          0 :         xas_lock_irqsave(&xas, flags);
     828                 :          0 :         xas_store(&xas, new);
     829                 :            : 
     830                 :          0 :         old->mapping = NULL;
     831                 :            :         /* hugetlb pages do not participate in page cache accounting. */
     832                 :            :         if (!PageHuge(old))
     833                 :          0 :                 __dec_node_page_state(new, NR_FILE_PAGES);
     834                 :            :         if (!PageHuge(new))
     835                 :          0 :                 __inc_node_page_state(new, NR_FILE_PAGES);
     836                 :          0 :         if (PageSwapBacked(old))
     837                 :          0 :                 __dec_node_page_state(new, NR_SHMEM);
     838                 :          0 :         if (PageSwapBacked(new))
     839                 :          0 :                 __inc_node_page_state(new, NR_SHMEM);
     840                 :          0 :         xas_unlock_irqrestore(&xas, flags);
     841                 :          0 :         mem_cgroup_migrate(old, new);
     842                 :          0 :         if (freepage)
     843                 :          0 :                 freepage(old);
     844                 :          0 :         put_page(old);
     845                 :            : 
     846                 :          0 :         return 0;
     847                 :            : }
     848                 :            : EXPORT_SYMBOL_GPL(replace_page_cache_page);
     849                 :            : 
                                        /*
                                         * Insert a locked page into @mapping's i_pages xarray at index @offset.
                                         *
                                         * Non-hugetlb pages are charged with mem_cgroup_try_charge() first; the
                                         * charge is committed on success and cancelled on the error path.  A page
                                         * reference is taken before the insertion and dropped again on failure.
                                         *
                                         * If the slot already holds a non-value entry, -EEXIST is recorded in the
                                         * xarray state and the insertion is abandoned.  If it holds a value entry,
                                         * mapping->nrexceptional is decremented and the old entry is passed back
                                         * through @shadowp when that pointer is non-NULL.
                                         *
                                         * Return: 0 on success, otherwise the error from mem_cgroup_try_charge()
                                         * or the error held in the xarray state.
                                         */
     850                 :          3 : static int __add_to_page_cache_locked(struct page *page,
     851                 :            :                                       struct address_space *mapping,
     852                 :            :                                       pgoff_t offset, gfp_t gfp_mask,
     853                 :            :                                       void **shadowp)
     854                 :            : {
     855                 :          3 :         XA_STATE(xas, &mapping->i_pages, offset);
     856                 :            :         int huge = PageHuge(page);
     857                 :            :         struct mem_cgroup *memcg;
     858                 :            :         int error;
     859                 :            :         void *old;
     860                 :            : 
     861                 :            :         VM_BUG_ON_PAGE(!PageLocked(page), page);
     862                 :            :         VM_BUG_ON_PAGE(PageSwapBacked(page), page);
     863                 :          3 :         mapping_set_update(&xas, mapping);
     864                 :            : 
                                                /* memcg is only written and read on the !huge paths. */
     865                 :            :         if (!huge) {
     866                 :          3 :                 error = mem_cgroup_try_charge(page, current->mm,
     867                 :            :                                               gfp_mask, &memcg, false);
     868                 :          3 :                 if (error)
     869                 :            :                         return error;
     870                 :            :         }
     871                 :            : 
                                                /* Take the reference and set mapping/index before the store. */
     872                 :          3 :         get_page(page);
     873                 :          3 :         page->mapping = mapping;
     874                 :          3 :         page->index = offset;
     875                 :            : 
                                                /* Repeat the locked insertion for as long as xas_nomem() asks for it. */
     876                 :            :         do {
     877                 :          3 :                 xas_lock_irq(&xas);
     878                 :          3 :                 old = xas_load(&xas);
     879                 :          3 :                 if (old && !xa_is_value(old))
     880                 :            :                         xas_set_err(&xas, -EEXIST);
     881                 :          3 :                 xas_store(&xas, page);
     882                 :          3 :                 if (xas_error(&xas))
     883                 :            :                         goto unlock;
     884                 :            : 
                                                        /* A value entry was replaced: account for it and report it. */
     885                 :          3 :                 if (xa_is_value(old)) {
     886                 :          0 :                         mapping->nrexceptional--;
     887                 :          0 :                         if (shadowp)
     888                 :          0 :                                 *shadowp = old;
     889                 :            :                 }
     890                 :          3 :                 mapping->nrpages++;
     891                 :            : 
     892                 :            :                 /* hugetlb pages do not participate in page cache accounting */
     893                 :            :                 if (!huge)
     894                 :          3 :                         __inc_node_page_state(page, NR_FILE_PAGES);
     895                 :            : unlock:
     896                 :          3 :                 xas_unlock_irq(&xas);
     897                 :          3 :         } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
     898                 :            : 
     899                 :          3 :         if (xas_error(&xas))
     900                 :            :                 goto error;
     901                 :            : 
     902                 :            :         if (!huge)
     903                 :          3 :                 mem_cgroup_commit_charge(page, memcg, false, false);
     904                 :          3 :         trace_mm_filemap_add_to_page_cache(page);
     905                 :          3 :         return 0;
     906                 :            : error:
                                                /* Undo the mapping assignment, the charge and the extra reference. */
     907                 :          3 :         page->mapping = NULL;
     908                 :            :         /* Leave page->index set: truncation relies upon it */
     909                 :            :         if (!huge)
     910                 :          3 :                 mem_cgroup_cancel_charge(page, memcg, false);
     911                 :          3 :         put_page(page);
     912                 :          3 :         return xas_error(&xas);
     913                 :            : }
     914                 :            : ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
     915                 :            : 
     916                 :            : /**
     917                 :            :  * add_to_page_cache_locked - add a locked page to the pagecache
     918                 :            :  * @page:       page to add
     919                 :            :  * @mapping:    the page's address_space
     920                 :            :  * @offset:     page index
     921                 :            :  * @gfp_mask:   page allocation mode
     922                 :            :  *
     923                 :            :  * This function is used to add a page to the pagecache. The page must be locked.
     924                 :            :  * This function does not add the page to the LRU.  The caller must do that.
                                         *
                                         * This is a thin wrapper around __add_to_page_cache_locked() that passes
                                         * NULL as the shadow pointer, so a replaced value entry is not reported.
     925                 :            :  *
     926                 :            :  * Return: %0 on success, negative error code otherwise.
     927                 :            :  */
     928                 :          0 : int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
     929                 :            :                 pgoff_t offset, gfp_t gfp_mask)
     930                 :            : {
     931                 :          0 :         return __add_to_page_cache_locked(page, mapping, offset,
     932                 :            :                                           gfp_mask, NULL);
     933                 :            : }
     934                 :            : EXPORT_SYMBOL(add_to_page_cache_locked);
     935                 :            : 
                                        /*
                                         * Lock @page, insert it into @mapping at @offset and, on success, add it
                                         * to the LRU with lru_cache_add().
                                         *
                                         * The page is locked with __SetPageLocked() before the insertion; if the
                                         * insertion fails the lock bit is cleared again with __ClearPageLocked().
                                         * On success, workingset_refault() is called when the insertion returned a
                                         * shadow entry and @gfp_mask does not contain __GFP_WRITE.
                                         *
                                         * Return: 0 on success, otherwise the error returned by
                                         * __add_to_page_cache_locked().
                                         */
     936                 :          3 : int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
     937                 :            :                                 pgoff_t offset, gfp_t gfp_mask)
     938                 :            : {
     939                 :          3 :         void *shadow = NULL;
     940                 :            :         int ret;
     941                 :            : 
     942                 :            :         __SetPageLocked(page);
     943                 :          3 :         ret = __add_to_page_cache_locked(page, mapping, offset,
     944                 :            :                                          gfp_mask, &shadow);
     945                 :          3 :         if (unlikely(ret))
     946                 :            :                 __ClearPageLocked(page);
     947                 :            :         else {
     948                 :            :                 /*
     949                 :            :                  * The page might have been evicted from cache only
     950                 :            :                  * recently, in which case it should be activated like
     951                 :            :                  * any other repeatedly accessed page.
     952                 :            :                  * The exception is pages getting rewritten; evicting other
     953                 :            :                  * data from the working set, only to cache data that will
     954                 :            :                  * get overwritten with something else, is a waste of memory.
     955                 :            :                  */
     956                 :          3 :                 WARN_ON_ONCE(PageActive(page));
     957                 :          3 :                 if (!(gfp_mask & __GFP_WRITE) && shadow)
     958                 :          0 :                         workingset_refault(page, shadow);
     959                 :          3 :                 lru_cache_add(page);
     960                 :            :         }
     961                 :          3 :         return ret;
     962                 :            : }
     963                 :            : EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
     964                 :            : 
     965                 :            : #ifdef CONFIG_NUMA
                                        /*
                                         * Allocate a page for the page cache (CONFIG_NUMA variant).
                                         *
                                         * When cpuset_do_page_mem_spread() is true, the page is requested from the
                                         * node returned by cpuset_mem_spread_node().  A failed allocation is
                                         * retried for as long as read_mems_allowed_retry() asks for it with the
                                         * cookie taken by read_mems_allowed_begin().  Otherwise the page comes
                                         * from alloc_pages().  Both paths pass 0 as the order argument.
                                         *
                                         * Return: the allocated page, or NULL if the allocation failed.
                                         */
     966                 :            : struct page *__page_cache_alloc(gfp_t gfp)
     967                 :            : {
     968                 :            :         int n;
     969                 :            :         struct page *page;
     970                 :            : 
     971                 :            :         if (cpuset_do_page_mem_spread()) {
     972                 :            :                 unsigned int cpuset_mems_cookie;
     973                 :            :                 do {
     974                 :            :                         cpuset_mems_cookie = read_mems_allowed_begin();
     975                 :            :                         n = cpuset_mem_spread_node();
     976                 :            :                         page = __alloc_pages_node(n, gfp, 0);
     977                 :            :                 } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
     978                 :            : 
     979                 :            :                 return page;
     980                 :            :         }
     981                 :            :         return alloc_pages(gfp, 0);
     982                 :            : }
     983                 :            : EXPORT_SYMBOL(__page_cache_alloc);
     984                 :            : #endif
     985                 :            : 
     986                 :            : /*
     987                 :            :  * In order to wait for pages to become available there must be
     988                 :            :  * waitqueues associated with pages. By using a hash table of
     989                 :            :  * waitqueues where the bucket discipline is to maintain all
     990                 :            :  * waiters on the same queue and wake all when any of the pages
     991                 :            :  * become available, and for the woken contexts to check to be
     992                 :            :  * sure the appropriate page became available, this saves space
     993                 :            :  * at a cost of "thundering herd" phenomena during rare hash
     994                 :            :  * collisions.
                                         *
                                         * The table has 1 << PAGE_WAIT_TABLE_BITS (256) buckets.
     995                 :            :  */
     996                 :            : #define PAGE_WAIT_TABLE_BITS 8
     997                 :            : #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
     998                 :            : static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
     999                 :            : 
                                        /*
                                         * Return the waitqueue head for @page: the page_wait_table bucket selected
                                         * by hashing the page pointer down to PAGE_WAIT_TABLE_BITS bits.
                                         */
    1000                 :            : static wait_queue_head_t *page_waitqueue(struct page *page)
    1001                 :            : {
    1002                 :          3 :         return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
    1003                 :            : }
    1004                 :            : 
                                        /*
                                         * Initialise every waitqueue head in page_wait_table, then call
                                         * page_writeback_init().
                                         */
    1005                 :          3 : void __init pagecache_init(void)
    1006                 :            : {
    1007                 :            :         int i;
    1008                 :            : 
    1009                 :          3 :         for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
    1010                 :          3 :                 init_waitqueue_head(&page_wait_table[i]);
    1011                 :            : 
    1012                 :          3 :         page_writeback_init();
    1013                 :          3 : }
    1014                 :            : 
    1015                 :            : /* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
                                        /* Key handed to wake_page_function() by wake_up_page_bit(). */
    1016                 :            : struct wait_page_key {
                                                /* page whose waiters are being woken */
    1017                 :            :         struct page *page;
                                                /* bit in page->flags the wakeup is for */
    1018                 :            :         int bit_nr;
                                                /* set to 1 by wake_page_function() when an entry for @page is seen */
    1019                 :            :         int page_match;
    1020                 :            : };
    1021                 :            : 
                                        /* Per-waiter record filled in by wait_on_page_bit_common(). */
    1022                 :            : struct wait_page_queue {
                                                /* page the waiter is waiting on */
    1023                 :            :         struct page *page;
                                                /* bit in page->flags the waiter is waiting on */
    1024                 :            :         int bit_nr;
                                                /* entry linked into the hashed waitqueue */
    1025                 :            :         wait_queue_entry_t wait;
    1026                 :            : };
    1027                 :            : 
                                        /*
                                         * Wake callback installed on page waiters by wait_on_page_bit_common().
                                         * @arg is the struct wait_page_key describing the page and bit being woken.
                                         *
                                         * Return: 0 when this entry waits on a different page or a different bit,
                                         * -1 when the bit is currently set in the page flags, otherwise the result
                                         * of autoremove_wake_function().
                                         */
    1028                 :          3 : static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
    1029                 :            : {
    1030                 :            :         struct wait_page_key *key = arg;
    1031                 :            :         struct wait_page_queue *wait_page
    1032                 :            :                 = container_of(wait, struct wait_page_queue, wait);
    1033                 :            : 
                                                /* Entry belongs to another page hashed onto the same queue. */
    1034                 :          3 :         if (wait_page->page != key->page)
    1035                 :            :                return 0;
                                                /* Record that the queue holds at least one entry for this page. */
    1036                 :          3 :         key->page_match = 1;
    1037                 :            : 
    1038                 :          3 :         if (wait_page->bit_nr != key->bit_nr)
    1039                 :            :                 return 0;
    1040                 :            : 
    1041                 :            :         /*
    1042                 :            :          * Stop walking if it's locked.
    1043                 :            :          * Is this safe if put_and_wait_on_page_locked() is in use?
    1044                 :            :          * Yes: the waker must hold a reference to this page, and if PG_locked
    1045                 :            :          * has now already been set by another task, that task must also hold
    1046                 :            :          * a reference to the *same usage* of this page; so there is no need
    1047                 :            :          * to walk on to wake even the put_and_wait_on_page_locked() callers.
    1048                 :            :          */
    1049                 :          3 :         if (test_bit(key->bit_nr, &key->page->flags))
    1050                 :            :                 return -1;
    1051                 :            : 
    1052                 :          3 :         return autoremove_wake_function(wait, mode, sync, key);
    1053                 :            : }
    1054                 :            : 
                                        /*
                                         * Wake the waiters queued for bit @bit_nr of @page.
                                         *
                                         * The hashed waitqueue is walked under q->lock using a bookmark entry.
                                         * While the bookmark carries WQ_FLAG_BOOKMARK after a pass, the lock is
                                         * dropped, cpu_relax() is called, and the walk is resumed.  Afterwards
                                         * ClearPageWaiters() is called if the queue is empty or no entry for
                                         * @page was seen (key.page_match is still 0).
                                         */
    1055                 :          3 : static void wake_up_page_bit(struct page *page, int bit_nr)
    1056                 :            : {
    1057                 :            :         wait_queue_head_t *q = page_waitqueue(page);
    1058                 :            :         struct wait_page_key key;
    1059                 :            :         unsigned long flags;
    1060                 :            :         wait_queue_entry_t bookmark;
    1061                 :            : 
    1062                 :          3 :         key.page = page;
    1063                 :          3 :         key.bit_nr = bit_nr;
    1064                 :          3 :         key.page_match = 0;
    1065                 :            : 
                                                /* Start with an empty, unflagged bookmark entry. */
    1066                 :          3 :         bookmark.flags = 0;
    1067                 :          3 :         bookmark.private = NULL;
    1068                 :          3 :         bookmark.func = NULL;
    1069                 :            :         INIT_LIST_HEAD(&bookmark.entry);
    1070                 :            : 
    1071                 :          3 :         spin_lock_irqsave(&q->lock, flags);
    1072                 :          3 :         __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
    1073                 :            : 
    1074                 :          3 :         while (bookmark.flags & WQ_FLAG_BOOKMARK) {
    1075                 :            :                 /*
    1076                 :            :                  * Take a breather from holding the lock,
    1077                 :            :                  * allow pages that finish wake up asynchronously
    1078                 :            :                  * to acquire the lock and remove themselves
    1079                 :            :                  * from wait queue
    1080                 :            :                  */
    1081                 :            :                 spin_unlock_irqrestore(&q->lock, flags);
    1082                 :          0 :                 cpu_relax();
    1083                 :          0 :                 spin_lock_irqsave(&q->lock, flags);
    1084                 :          0 :                 __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
    1085                 :            :         }
    1086                 :            : 
    1087                 :            :         /*
    1088                 :            :          * It is possible for other pages to have collided on the waitqueue
    1089                 :            :          * hash, so in that case check for a page match. That prevents a long-
    1090                 :            :          * term waiter
    1091                 :            :          *
    1092                 :            :          * It is still possible to miss a case here, when we woke page waiters
    1093                 :            :          * and removed them from the waitqueue, but there are still other
    1094                 :            :          * page waiters.
    1095                 :            :          */
    1096                 :          3 :         if (!waitqueue_active(q) || !key.page_match) {
    1097                 :            :                 ClearPageWaiters(page);
    1098                 :            :                 /*
    1099                 :            :                  * It's possible to miss clearing Waiters here, when we woke
    1100                 :            :                  * our page waiters, but the hashed waitqueue has waiters for
    1101                 :            :                  * other pages on it.
    1102                 :            :                  *
    1103                 :            :                  * That's okay, it's a rare case. The next waker will clear it.
    1104                 :            :                  */
    1105                 :            :         }
    1106                 :            :         spin_unlock_irqrestore(&q->lock, flags);
    1107                 :          3 : }
    1108                 :            : 
                                        /*
                                         * Wake waiters on bit @bit of @page, but only if PageWaiters(@page) is
                                         * set; otherwise return without touching the waitqueue.
                                         */
    1109                 :          3 : static void wake_up_page(struct page *page, int bit)
    1110                 :            : {
    1111                 :          3 :         if (!PageWaiters(page))
    1112                 :          3 :                 return;
    1113                 :          3 :         wake_up_page_bit(page, bit);
    1114                 :            : }
    1115                 :            : 
    1116                 :            : /*
    1117                 :            :  * A choice of three behaviors for wait_on_page_bit_common():
                                         * each one selects what the waiter does with its page reference
                                         * and with the waited-for bit once it has been woken.
    1118                 :            :  */
    1119                 :            : enum behavior {
    1120                 :            :         EXCLUSIVE,      /* Hold ref to page and take the bit when woken, like
    1121                 :            :                          * __lock_page() waiting on then setting PG_locked.
    1122                 :            :                          */
    1123                 :            :         SHARED,         /* Hold ref to page and check the bit when woken, like
    1124                 :            :                          * wait_on_page_writeback() waiting on PG_writeback.
    1125                 :            :                          */
    1126                 :            :         DROP,           /* Drop ref to page before wait, no check when woken,
    1127                 :            :                          * like put_and_wait_on_page_locked() on PG_locked.
    1128                 :            :                          */
    1129                 :            : };
    1130                 :            : 
                                        /*
                                         * Common sleep loop for waiting on bit @bit_nr in @page->flags.
                                         *
                                         * @q:        waitqueue the waiter is queued on
                                         * @page:     page whose flag bit is waited for
                                         * @bit_nr:   bit number within page->flags
                                         * @state:    task state set before sleeping and used for the signal check
                                         * @behavior: EXCLUSIVE, SHARED or DROP (see enum behavior)
                                         *
                                         * Return: 0 normally (for EXCLUSIVE, once the bit has been taken),
                                         * -EINTR if signal_pending_state(@state, current) was true.
                                         */
    1131                 :          3 : static inline int wait_on_page_bit_common(wait_queue_head_t *q,
    1132                 :            :         struct page *page, int bit_nr, int state, enum behavior behavior)
    1133                 :            : {
    1134                 :            :         struct wait_page_queue wait_page;
    1135                 :            :         wait_queue_entry_t *wait = &wait_page.wait;
    1136                 :            :         bool bit_is_set;
    1137                 :            :         bool thrashing = false;
    1138                 :            :         bool delayacct = false;
    1139                 :            :         unsigned long pflags;
    1140                 :            :         int ret = 0;
    1141                 :            : 
                                                /*
                                                 * Waiting on PG_locked for a not-uptodate workingset page is
                                                 * bracketed by psi_memstall_enter()/leave(), and additionally by
                                                 * delayacct_thrashing_start()/end() unless the page is swap-backed.
                                                 */
    1142                 :          3 :         if (bit_nr == PG_locked &&
    1143                 :          3 :             !PageUptodate(page) && PageWorkingset(page)) {
    1144                 :          0 :                 if (!PageSwapBacked(page)) {
    1145                 :          0 :                         delayacct_thrashing_start();
    1146                 :            :                         delayacct = true;
    1147                 :            :                 }
    1148                 :            :                 psi_memstall_enter(&pflags);
    1149                 :            :                 thrashing = true;
    1150                 :            :         }
    1151                 :            : 
                                                /* Set up the on-stack wait entry; only EXCLUSIVE waiters get the flag. */
    1152                 :          3 :         init_wait(wait);
    1153                 :          3 :         wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
    1154                 :          3 :         wait->func = wake_page_function;
    1155                 :          3 :         wait_page.page = page;
    1156                 :          3 :         wait_page.bit_nr = bit_nr;
    1157                 :            : 
    1158                 :            :         for (;;) {
    1159                 :            :                 spin_lock_irq(&q->lock);
    1160                 :            : 
                                                        /* Queue the entry if it is not already on a list, and flag the page. */
    1161                 :          3 :                 if (likely(list_empty(&wait->entry))) {
    1162                 :            :                         __add_wait_queue_entry_tail(q, wait);
    1163                 :            :                         SetPageWaiters(page);
    1164                 :            :                 }
    1165                 :            : 
    1166                 :          3 :                 set_current_state(state);
    1167                 :            : 
    1168                 :            :                 spin_unlock_irq(&q->lock);
    1169                 :            : 
                                                        /* Sample the bit before a DROP waiter gives up its reference. */
    1170                 :          3 :                 bit_is_set = test_bit(bit_nr, &page->flags);
    1171                 :          3 :                 if (behavior == DROP)
    1172                 :          0 :                         put_page(page);
    1173                 :            : 
                                                        /* Sleep only if the bit was set when sampled. */
    1174                 :          3 :                 if (likely(bit_is_set))
    1175                 :          3 :                         io_schedule();
    1176                 :            : 
                                                        /*
                                                         * EXCLUSIVE: try to take the bit, done if it was clear.
                                                         * SHARED: done once the bit is observed clear.
                                                         */
    1177                 :          3 :                 if (behavior == EXCLUSIVE) {
    1178                 :          3 :                         if (!test_and_set_bit_lock(bit_nr, &page->flags))
    1179                 :            :                                 break;
    1180                 :          3 :                 } else if (behavior == SHARED) {
    1181                 :          3 :                         if (!test_bit(bit_nr, &page->flags))
    1182                 :            :                                 break;
    1183                 :            :                 }
    1184                 :            : 
    1185                 :          3 :                 if (signal_pending_state(state, current)) {
    1186                 :            :                         ret = -EINTR;
    1187                 :            :                         break;
    1188                 :            :                 }
    1189                 :            : 
    1190                 :          3 :                 if (behavior == DROP) {
    1191                 :            :                         /*
    1192                 :            :                          * We can no longer safely access page->flags:
    1193                 :            :                          * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
    1194                 :            :                          * there is a risk of waiting forever on a page reused
    1195                 :            :                          * for something that keeps it locked indefinitely.
    1196                 :            :                          * But best check for -EINTR above before breaking.
    1197                 :            :                          */
    1198                 :            :                         break;
    1199                 :            :                 }
    1200                 :            :         }
    1201                 :            : 
    1202                 :          3 :         finish_wait(q, wait);
    1203                 :            : 
                                                /* Close the stall/thrashing accounting opened before the wait. */
    1204                 :          3 :         if (thrashing) {
    1205                 :          0 :                 if (delayacct)
    1206                 :          0 :                         delayacct_thrashing_end();
    1207                 :            :                 psi_memstall_leave(&pflags);
    1208                 :            :         }
    1209                 :            : 
    1210                 :            :         /*
    1211                 :            :          * A signal could leave PageWaiters set. Clearing it here if
    1212                 :            :          * !waitqueue_active would be possible (by open-coding finish_wait),
    1213                 :            :          * but still fail to catch it in the case of wait hash collision. We
    1214                 :            :          * already can fail to clear wait hash collision cases, so don't
    1215                 :            :          * bother with signals either.
    1216                 :            :          */
    1217                 :            : 
    1218                 :          3 :         return ret;
    1219                 :            : }
    1220                 :            : 
                                        /*
                                         * Wait in TASK_UNINTERRUPTIBLE, with SHARED behavior, on bit @bit_nr of
                                         * @page, using the page's hashed waitqueue.  The result of
                                         * wait_on_page_bit_common() is not returned.
                                         */
    1221                 :          3 : void wait_on_page_bit(struct page *page, int bit_nr)
    1222                 :            : {
    1223                 :            :         wait_queue_head_t *q = page_waitqueue(page);
    1224                 :          3 :         wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
    1225                 :          3 : }
    1226                 :            : EXPORT_SYMBOL(wait_on_page_bit);
    1227                 :            : 
                                        /*
                                         * Like wait_on_page_bit(), but sleeps in TASK_KILLABLE.
                                         *
                                         * Return: the value returned by wait_on_page_bit_common(), i.e. 0 or
                                         * -EINTR.
                                         */
    1228                 :          0 : int wait_on_page_bit_killable(struct page *page, int bit_nr)
    1229                 :            : {
    1230                 :            :         wait_queue_head_t *q = page_waitqueue(page);
    1231                 :          3 :         return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
    1232                 :            : }
    1233                 :            : EXPORT_SYMBOL(wait_on_page_bit_killable);
    1234                 :            : 
    1235                 :            : /**
    1236                 :            :  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
    1237                 :            :  * @page: The page to wait for.
    1238                 :            :  *
    1239                 :            :  * The caller should hold a reference on @page.  They expect the page to
    1240                 :            :  * become unlocked relatively soon, but do not wish to hold up migration
    1241                 :            :  * (for example) by holding the reference while waiting for the page to
    1242                 :            :  * come unlocked.  After this function returns, the caller should not
    1243                 :            :  * dereference @page.
    1244                 :            :  */
    1245                 :          0 : void put_and_wait_on_page_locked(struct page *page)
    1246                 :            : {
    1247                 :            :         wait_queue_head_t *q;
    1248                 :            : 
                                                /* Both the waitqueue lookup and the wait use the compound head. */
    1249                 :            :         page = compound_head(page);
    1250                 :            :         q = page_waitqueue(page);
                                                /* DROP: the reference is put inside the wait loop before sleeping. */
    1251                 :          0 :         wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
    1252                 :          0 : }
    1253                 :            : 
    1254                 :            : /**
    1255                 :            :  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
    1256                 :            :  * @page: Page defining the wait queue of interest
    1257                 :            :  * @waiter: Waiter to add to the queue
    1258                 :            :  *
    1259                 :            :  * Add an arbitrary @waiter to the wait queue for the nominated @page.
                                         *
                                         * The waiter is appended at the tail of the page's hashed waitqueue and
                                         * SetPageWaiters() is called on @page, both while holding the queue lock
                                         * taken with spin_lock_irqsave().
    1260                 :            :  */
    1261                 :          0 : void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
    1262                 :            : {
    1263                 :            :         wait_queue_head_t *q = page_waitqueue(page);
    1264                 :            :         unsigned long flags;
    1265                 :            : 
    1266                 :          0 :         spin_lock_irqsave(&q->lock, flags);
    1267                 :            :         __add_wait_queue_entry_tail(q, waiter);
    1268                 :            :         SetPageWaiters(page);
    1269                 :            :         spin_unlock_irqrestore(&q->lock, flags);
    1270                 :          0 : }
    1271                 :            : EXPORT_SYMBOL_GPL(add_page_wait_queue);
    1272                 :            : 
    1273                 :            : #ifndef clear_bit_unlock_is_negative_byte
    1274                 :            : 
    1275                 :            : /*
    1276                 :            :  * PG_waiters is the high bit in the same byte as PG_locked.
    1277                 :            :  *
    1278                 :            :  * On x86 (and on many other architectures), we can clear PG_locked and
    1279                 :            :  * test the sign bit at the same time. But if the architecture does
    1280                 :            :  * not support that special operation, we just do this all by hand
    1281                 :            :  * instead.
    1282                 :            :  *
    1283                 :            :  * The read of PG_waiters has to be after (or concurrently with) PG_locked
    1284                 :            :  * being cleared, but a memory barrier should be unnecessary since it is
    1285                 :            :  * in the same byte as PG_locked.
    1286                 :            :  */
    1287                 :            : static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) /* generic fallback: only compiled when no definition of this name already exists (see the enclosing #ifndef) */
    1288                 :            : {
    1289                 :            :         clear_bit_unlock(nr, mem); /* clear bit @nr in the word at @mem */
    1290                 :            :         /* smp_mb__after_atomic(); -- intentionally not issued; the comment above this function explains why */
    1291                 :            :         return test_bit(PG_waiters, mem); /* done as a separate step after the clear; always tests PG_waiters, whatever @nr was */
    1292                 :            : }
    1293                 :            : 
    1294                 :            : #endif
    1295                 :            : 
    1296                 :            : /**
    1297                 :            :  * unlock_page - unlock a locked page
    1298                 :            :  * @page: the page
    1299                 :            :  *
    1300                 :            :  * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
    1301                 :            :  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
    1302                 :            :  * mechanism between PageLocked pages and PageWriteback pages is shared.
    1303                 :            :  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
    1304                 :            :  *
    1305                 :            :  * Note that this depends on PG_waiters being the sign bit in the byte
    1306                 :            :  * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
    1307                 :            :  * clear the PG_locked bit and test PG_waiters at the same time fairly
    1308                 :            :  * portably (architectures that do LL/SC can test any bit, while x86 can
    1309                 :            :  * test the sign bit).
    1310                 :            :  */
    1311                 :          3 : void unlock_page(struct page *page)
    1312                 :            : {
    1313                 :            :         BUILD_BUG_ON(PG_waiters != 7); /* compile-time check: PG_waiters must be bit 7, the sign bit of its byte */
    1314                 :            :         page = compound_head(page); /* the lock bit is cleared in the head page's flags */
    1315                 :            :         VM_BUG_ON_PAGE(!PageLocked(page), page); /* assert that the page really is locked on entry */
    1316                 :          3 :         if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags)) /* clear PG_locked and learn whether PG_waiters is set */
    1317                 :          3 :                 wake_up_page_bit(page, PG_locked); /* wake-up is skipped entirely when PG_waiters was not set */
    1318                 :          3 : }
    1319                 :            : EXPORT_SYMBOL(unlock_page);
    1320                 :            : 
    1321                 :            : /**
    1322                 :            :  * end_page_writeback - end writeback against a page
    1323                 :            :  * @page: the page
    1324                 :            :  */
    1325                 :          3 : void end_page_writeback(struct page *page)
    1326                 :            : {
    1327                 :            :         /*
    1328                 :            :          * TestClearPageReclaim could be used here but it is an atomic
    1329                 :            :          * operation and overkill in this particular case. Failing to
    1330                 :            :          * shuffle a page marked for immediate reclaim is too mild to
    1331                 :            :          * justify taking an atomic operation penalty at the end of
    1332                 :            :          * every page writeback.
    1333                 :            :          */
    1334                 :          3 :         if (PageReclaim(page)) { /* separate test and clear, deliberately not one atomic test-and-clear (see above) */
    1335                 :            :                 ClearPageReclaim(page);
    1336                 :          0 :                 rotate_reclaimable_page(page);
    1337                 :            :         }
    1338                 :            : 
    1339                 :          3 :         if (!test_clear_page_writeback(page)) /* a zero result is treated as fatal - presumably it means the writeback flag was not set; confirm in test_clear_page_writeback() */
    1340                 :          0 :                 BUG();
    1341                 :            : 
    1342                 :          3 :         smp_mb__after_atomic(); /* barrier between clearing the writeback flag and the wake-up below */
    1343                 :          3 :         wake_up_page(page, PG_writeback); /* wake tasks waiting on PG_writeback for this page */
    1344                 :          3 : }
    1345                 :            : EXPORT_SYMBOL(end_page_writeback);
    1346                 :            : 
    1347                 :            : /*
    1348                 :            :  * After completing I/O on a page, call this routine to update the page
    1349                 :            :  * flags appropriately
    1350                 :            :  */
    1351                 :          3 : void page_endio(struct page *page, bool is_write, int err) /* @is_write selects the write or read completion path; @err is zero on success */
    1352                 :            : {
    1353                 :          3 :         if (!is_write) { /* read completion */
    1354                 :          3 :                 if (!err) {
    1355                 :            :                         SetPageUptodate(page);
    1356                 :            :                 } else {
    1357                 :            :                         ClearPageUptodate(page);
    1358                 :            :                         SetPageError(page);
    1359                 :            :                 }
    1360                 :          3 :                 unlock_page(page); /* a read always ends by unlocking the page, whether it succeeded or not */
    1361                 :            :         } else { /* write completion */
    1362                 :          0 :                 if (err) {
    1363                 :            :                         struct address_space *mapping;
    1364                 :            : 
    1365                 :            :                         SetPageError(page);
    1366                 :          0 :                         mapping = page_mapping(page);
    1367                 :          0 :                         if (mapping) /* without a mapping the error is recorded on the page only */
    1368                 :          0 :                                 mapping_set_error(mapping, err);
    1369                 :            :                 }
    1370                 :          0 :                 end_page_writeback(page); /* a write always ends writeback on the page, whether it succeeded or not */
    1371                 :            :         }
    1372                 :          3 : }
    1373                 :            : EXPORT_SYMBOL_GPL(page_endio);
    1374                 :            : 
    1375                 :            : /**
    1376                 :            :  * __lock_page - get a lock on the page, assuming we need to sleep to get it
    1377                 :            :  * @__page: the page to lock
    1378                 :            :  */
    1379                 :          3 : void __lock_page(struct page *__page)
    1380                 :            : {
    1381                 :            :         struct page *page = compound_head(__page); /* the lock is taken on the head page */
    1382                 :            :         wait_queue_head_t *q = page_waitqueue(page);
    1383                 :          3 :         wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, /* uninterruptible sleep; the helper's return value is not used */
    1384                 :            :                                 EXCLUSIVE); /* EXCLUSIVE: presumably the helper also acquires PG_locked for the caller - confirm in wait_on_page_bit_common() */
    1385                 :          3 : }
    1386                 :            : EXPORT_SYMBOL(__lock_page);
    1387                 :            : 
    1388                 :          3 : int __lock_page_killable(struct page *__page) /* same sequence as __lock_page(), but sleeps in TASK_KILLABLE and returns the helper's result */
    1389                 :            : {
    1390                 :            :         struct page *page = compound_head(__page); /* the lock is taken on the head page */
    1391                 :            :         wait_queue_head_t *q = page_waitqueue(page);
    1392                 :          3 :         return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, /* __lock_page_or_retry() treats a nonzero return as "page not locked" */
    1393                 :            :                                         EXCLUSIVE);
    1394                 :            : }
    1395                 :            : EXPORT_SYMBOL_GPL(__lock_page_killable);
    1396                 :            : 
    1397                 :            : /*
    1398                 :            :  * Return values:
    1399                 :            :  * 1 - page is locked; mmap_sem is still held.
    1400                 :            :  * 0 - page is not locked.
    1401                 :            :  *     mmap_sem has been released (up_read()), unless flags had both
    1402                 :            :  *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
    1403                 :            :  *     which case mmap_sem is still held.
    1404                 :            :  *
    1405                 :            :  * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
    1406                 :            :  * with the page locked and the mmap_sem unperturbed.
    1407                 :            :  */
    1408                 :          0 : int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
    1409                 :            :                          unsigned int flags)
    1410                 :            : {
    1411                 :          0 :         if (flags & FAULT_FLAG_ALLOW_RETRY) { /* retry allowed: never take the page lock here, at most wait for it to be released */
    1412                 :            :                 /*
    1413                 :            :                  * CAUTION! In this case, mmap_sem is not released
    1414                 :            :                  * even though we return 0.
    1415                 :            :                  */
    1416                 :          0 :                 if (flags & FAULT_FLAG_RETRY_NOWAIT)
    1417                 :            :                         return 0;
    1418                 :            : 
    1419                 :          0 :                 up_read(&mm->mmap_sem); /* drop mmap_sem before sleeping on the page */
    1420                 :          0 :                 if (flags & FAULT_FLAG_KILLABLE)
    1421                 :          0 :                         wait_on_page_locked_killable(page); /* result deliberately unused: 0 is returned either way */
    1422                 :            :                 else
    1423                 :          0 :                         wait_on_page_locked(page);
    1424                 :            :                 return 0; /* page was only waited for, not locked; mmap_sem has been released */
    1425                 :            :         } else {
    1426                 :          0 :                 if (flags & FAULT_FLAG_KILLABLE) {
    1427                 :            :                         int ret;
    1428                 :            : 
    1429                 :          0 :                         ret = __lock_page_killable(page);
    1430                 :          0 :                         if (ret) { /* nonzero: the page was not locked - presumably the wait was cut short by a fatal signal */
    1431                 :          0 :                                 up_read(&mm->mmap_sem);
    1432                 :          0 :                                 return 0;
    1433                 :            :                         }
    1434                 :            :                 } else
    1435                 :          0 :                         __lock_page(page);
    1436                 :            :                 return 1; /* page is locked and mmap_sem was not touched on this path */
    1437                 :            :         }
    1438                 :            : }
    1439                 :            : 
    1440                 :            : /**
    1441                 :            :  * page_cache_next_miss() - Find the next gap in the page cache.
    1442                 :            :  * @mapping: Mapping.
    1443                 :            :  * @index: Index.
    1444                 :            :  * @max_scan: Maximum range to search.
    1445                 :            :  *
    1446                 :            :  * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
    1447                 :            :  * gap with the lowest index.
    1448                 :            :  *
    1449                 :            :  * This function may be called under the rcu_read_lock.  However, this will
    1450                 :            :  * not atomically search a snapshot of the cache at a single point in time.
    1451                 :            :  * For example, if a gap is created at index 5, then subsequently a gap is
    1452                 :            :  * created at index 10, page_cache_next_miss covering both indices may
    1453                 :            :  * return 10 if called under the rcu_read_lock.
    1454                 :            :  *
    1455                 :            :  * Return: The index of the gap if found, otherwise an index outside the
    1456                 :            :  * range specified (in which case 'return - index >= max_scan' will be true).
    1457                 :            :  * In the rare case of index wrap-around, 0 will be returned.
    1458                 :            :  */
    1459                 :          3 : pgoff_t page_cache_next_miss(struct address_space *mapping,
    1460                 :            :                              pgoff_t index, unsigned long max_scan)
    1461                 :            : {
    1462                 :          3 :         XA_STATE(xas, &mapping->i_pages, index); /* iteration state over the mapping's i_pages, starting at @index */
    1463                 :            : 
    1464                 :          3 :         while (max_scan--) { /* at most @max_scan iterations */
    1465                 :          3 :                 void *entry = xas_next(&xas);
    1466                 :          3 :                 if (!entry || xa_is_value(entry)) /* gap found: empty slot, or a value entry rather than a page */
    1467                 :            :                         break;
    1468                 :          3 :                 if (xas.xa_index == 0) /* index wrapped around; 0 is returned as documented */
    1469                 :            :                         break;
    1470                 :            :         }
    1471                 :            : 
    1472                 :          3 :         return xas.xa_index; /* index at which the scan stopped */
    1473                 :            : }
    1474                 :            : EXPORT_SYMBOL(page_cache_next_miss);
    1475                 :            : 
    1476                 :            : /**
    1477                 :            :  * page_cache_prev_miss() - Find the previous gap in the page cache.
    1478                 :            :  * @mapping: Mapping.
    1479                 :            :  * @index: Index.
    1480                 :            :  * @max_scan: Maximum range to search.
    1481                 :            :  *
    1482                 :            :  * Search the range [max(index - max_scan + 1, 0), index] for the
    1483                 :            :  * gap with the highest index.
    1484                 :            :  *
    1485                 :            :  * This function may be called under the rcu_read_lock.  However, this will
    1486                 :            :  * not atomically search a snapshot of the cache at a single point in time.
    1487                 :            :  * For example, if a gap is created at index 10, then subsequently a gap is
    1488                 :            :  * created at index 5, page_cache_prev_miss() covering both indices may
    1489                 :            :  * return 5 if called under the rcu_read_lock.
    1490                 :            :  *
    1491                 :            :  * Return: The index of the gap if found, otherwise an index outside the
    1492                 :            :  * range specified (in which case 'index - return >= max_scan' will be true).
    1493                 :            :  * In the rare case of wrap-around, ULONG_MAX will be returned.
    1494                 :            :  */
    1495                 :          3 : pgoff_t page_cache_prev_miss(struct address_space *mapping,
    1496                 :            :                              pgoff_t index, unsigned long max_scan)
    1497                 :            : {
    1498                 :          3 :         XA_STATE(xas, &mapping->i_pages, index); /* iteration state over the mapping's i_pages, starting at @index */
    1499                 :            : 
    1500                 :          3 :         while (max_scan--) { /* at most @max_scan iterations */
    1501                 :          3 :                 void *entry = xas_prev(&xas);
    1502                 :          3 :                 if (!entry || xa_is_value(entry)) /* gap found: empty slot, or a value entry rather than a page */
    1503                 :            :                         break;
    1504                 :          3 :                 if (xas.xa_index == ULONG_MAX) /* index wrapped below 0; ULONG_MAX is returned as documented */
    1505                 :            :                         break;
    1506                 :            :         }
    1507                 :            : 
    1508                 :          3 :         return xas.xa_index; /* index at which the scan stopped */
    1509                 :            : }
    1510                 :            : EXPORT_SYMBOL(page_cache_prev_miss);
    1511                 :            : 
    1512                 :            : /**
    1513                 :            :  * find_get_entry - find and get a page cache entry
    1514                 :            :  * @mapping: the address_space to search
    1515                 :            :  * @offset: the page cache index
    1516                 :            :  *
    1517                 :            :  * Looks up the page cache slot at @mapping & @offset.  If there is a
    1518                 :            :  * page cache page, it is returned with an increased refcount.
    1519                 :            :  *
    1520                 :            :  * If the slot holds a shadow entry of a previously evicted page, or a
    1521                 :            :  * swap entry from shmem/tmpfs, it is returned.
    1522                 :            :  *
    1523                 :            :  * Return: the found page or shadow entry, %NULL if nothing is found.
    1524                 :            :  */
    1525                 :          3 : struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
    1526                 :            : {
    1527                 :          3 :         XA_STATE(xas, &mapping->i_pages, offset);
    1528                 :            :         struct page *page;
    1529                 :            : 
    1530                 :            :         rcu_read_lock(); /* the whole lookup, including every retry, runs inside one RCU read-side section */
    1531                 :            : repeat:
    1532                 :            :         xas_reset(&xas); /* every retry restarts the lookup from a reset state */
    1533                 :          3 :         page = xas_load(&xas);
    1534                 :          3 :         if (xas_retry(&xas, page)) /* the loaded entry asks for the lookup to be repeated */
    1535                 :            :                 goto repeat;
    1536                 :            :         /*
    1537                 :            :          * A shadow entry of a recently evicted page, or a swap entry from
    1538                 :            :          * shmem/tmpfs.  Return it without attempting to raise page count.
    1539                 :            :          */
    1540                 :          3 :         if (!page || xa_is_value(page)) /* an empty slot (NULL) takes the same exit */
    1541                 :            :                 goto out;
    1542                 :            : 
    1543                 :          3 :         if (!page_cache_get_speculative(page)) /* no reference could be taken: look the slot up again */
    1544                 :            :                 goto repeat;
    1545                 :            : 
    1546                 :            :         /*
    1547                 :            :          * Has the page moved or been split?
    1548                 :            :          * This is part of the lockless pagecache protocol. See
    1549                 :            :          * include/linux/pagemap.h for details.
    1550                 :            :          */
    1551                 :          3 :         if (unlikely(page != xas_reload(&xas))) {
    1552                 :          3 :                 put_page(page); /* give back the reference just taken before retrying */
    1553                 :          3 :                 goto repeat;
    1554                 :            :         }
    1555                 :          3 :         page = find_subpage(page, offset); /* presumably selects the subpage for @offset when the entry is a compound page - see find_subpage() */
    1556                 :            : out:
    1557                 :            :         rcu_read_unlock();
    1558                 :            : 
    1559                 :          3 :         return page; /* referenced page, value entry (no reference taken), or NULL */
    1560                 :            : }
    1561                 :            : EXPORT_SYMBOL(find_get_entry);
    1562                 :            : 
    1563                 :            : /**
    1564                 :            :  * find_lock_entry - locate, pin and lock a page cache entry
    1565                 :            :  * @mapping: the address_space to search
    1566                 :            :  * @offset: the page cache index
    1567                 :            :  *
    1568                 :            :  * Looks up the page cache slot at @mapping & @offset.  If there is a
    1569                 :            :  * page cache page, it is returned locked and with an increased
    1570                 :            :  * refcount.
    1571                 :            :  *
    1572                 :            :  * If the slot holds a shadow entry of a previously evicted page, or a
    1573                 :            :  * swap entry from shmem/tmpfs, it is returned.
    1574                 :            :  *
    1575                 :            :  * find_lock_entry() may sleep.
    1576                 :            :  *
    1577                 :            :  * Return: the found page or shadow entry, %NULL if nothing is found.
    1578                 :            :  */
    1579                 :          3 : struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
    1580                 :            : {
    1581                 :            :         struct page *page;
    1582                 :            : 
    1583                 :            : repeat:
    1584                 :          3 :         page = find_get_entry(mapping, offset);
    1585                 :          3 :         if (page && !xa_is_value(page)) { /* only a real page is locked; NULL and value entries are returned as they are */
    1586                 :          3 :                 lock_page(page);
    1587                 :            :                 /* Has the page been truncated? */
    1588                 :          3 :                 if (unlikely(page_mapping(page) != mapping)) {
    1589                 :          0 :                         unlock_page(page); /* undo the lock and the reference, then look the slot up again */
    1590                 :          0 :                         put_page(page);
    1591                 :          0 :                         goto repeat;
    1592                 :            :                 }
    1593                 :            :                 VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); /* the locked page must sit at @offset */
    1594                 :            :         }
    1595                 :          3 :         return page;
    1596                 :            : }
    1597                 :            : EXPORT_SYMBOL(find_lock_entry);
    1598                 :            : 
    1599                 :            : /**
    1600                 :            :  * pagecache_get_page - find and get a page reference
    1601                 :            :  * @mapping: the address_space to search
    1602                 :            :  * @offset: the page index
    1603                 :            :  * @fgp_flags: FGP flags
    1604                 :            :  * @gfp_mask: gfp mask to use for the page cache data page allocation
    1605                 :            :  *
    1606                 :            :  * Looks up the page cache slot at @mapping & @offset.
    1607                 :            :  *
    1608                 :            :  * FGP flags modify how the page is returned.
    1609                 :            :  *
    1610                 :            :  * @fgp_flags can be:
    1611                 :            :  *
    1612                 :            :  * - FGP_ACCESSED: the page will be marked accessed
    1613                 :            :  * - FGP_LOCK: Page is returned locked
    1614                 :            :  * - FGP_CREAT: If page is not present then a new page is allocated using
    1615                 :            :  *   @gfp_mask and added to the page cache and the VM's LRU
    1616                 :            :  *   list. The page is returned locked and with an increased
    1617                 :            :  *   refcount.
    1618                 :            :  * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
    1619                 :            :  *   its own locking dance if the page is already in cache, or unlock the page
    1620                 :            :  *   before returning if we had to add the page to pagecache.
    1621                 :            :  *
    1622                 :            :  * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
    1623                 :            :  * if the GFP flags specified for FGP_CREAT are atomic.
    1624                 :            :  *
    1625                 :            :  * If there is a page cache page, it is returned with an increased refcount.
    1626                 :            :  *
    1627                 :            :  * Return: the found page or %NULL otherwise.
    1628                 :            :  */
    1629                 :          3 : struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
    1630                 :            :         int fgp_flags, gfp_t gfp_mask)
    1631                 :            : {
    1632                 :            :         struct page *page;
    1633                 :            : 
    1634                 :            : repeat:
    1635                 :          3 :         page = find_get_entry(mapping, offset);
    1636                 :          3 :         if (xa_is_value(page)) /* a value (shadow/swap) entry is treated the same as an empty slot */
    1637                 :            :                 page = NULL;
    1638                 :          3 :         if (!page)
    1639                 :            :                 goto no_page;
    1640                 :            : 
    1641                 :          3 :         if (fgp_flags & FGP_LOCK) {
    1642                 :          3 :                 if (fgp_flags & FGP_NOWAIT) { /* FGP_NOWAIT: only try the lock; on failure drop the reference and return NULL */
    1643                 :          0 :                         if (!trylock_page(page)) {
    1644                 :          0 :                                 put_page(page);
    1645                 :          0 :                                 return NULL;
    1646                 :            :                         }
    1647                 :            :                 } else {
    1648                 :          3 :                         lock_page(page);
    1649                 :            :                 }
    1650                 :            : 
    1651                 :            :                 /* Has the page been truncated? */
    1652                 :          3 :                 if (unlikely(compound_head(page)->mapping != mapping)) {
    1653                 :          0 :                         unlock_page(page); /* undo the lock and the reference, then start over */
    1654                 :          0 :                         put_page(page);
    1655                 :          0 :                         goto repeat;
    1656                 :            :                 }
    1657                 :            :                 VM_BUG_ON_PAGE(page->index != offset, page); /* the locked page must sit at @offset */
    1658                 :            :         }
    1659                 :            : 
    1660                 :          3 :         if (fgp_flags & FGP_ACCESSED)
    1661                 :          3 :                 mark_page_accessed(page);
    1662                 :            : 
    1663                 :            : no_page: /* reached both by the jump on a miss and by falling through after a hit; only a miss with FGP_CREAT allocates */
    1664                 :          3 :         if (!page && (fgp_flags & FGP_CREAT)) {
    1665                 :            :                 int err;
    1666                 :          3 :                 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) /* FGP_WRITE: add __GFP_WRITE when mapping_cap_account_dirty() holds for the mapping */
    1667                 :          3 :                         gfp_mask |= __GFP_WRITE;
    1668                 :          3 :                 if (fgp_flags & FGP_NOFS) /* FGP_NOFS: strip __GFP_FS from the mask */
    1669                 :          0 :                         gfp_mask &= ~__GFP_FS;
    1670                 :            : 
    1671                 :            :                 page = __page_cache_alloc(gfp_mask);
    1672                 :          3 :                 if (!page) /* allocation failed */
    1673                 :            :                         return NULL;
    1674                 :            : 
    1675                 :          3 :                 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) /* FGP_CREAT with neither FGP_LOCK nor FGP_FOR_MMAP: warn once and continue as if FGP_LOCK had been passed */
    1676                 :          0 :                         fgp_flags |= FGP_LOCK;
    1677                 :            : 
    1678                 :            :                 /* Init accessed so avoid atomic mark_page_accessed later */
    1679                 :          3 :                 if (fgp_flags & FGP_ACCESSED)
    1680                 :            :                         __SetPageReferenced(page);
    1681                 :            : 
    1682                 :          3 :                 err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
    1683                 :          3 :                 if (unlikely(err)) {
    1684                 :          3 :                         put_page(page); /* drop the reference to the page that could not be inserted */
    1685                 :            :                         page = NULL; /* any error other than -EEXIST falls through and returns NULL */
    1686                 :          3 :                         if (err == -EEXIST) /* presumably another page was inserted at @offset meanwhile: start over and look it up */
    1687                 :            :                                 goto repeat;
    1688                 :            :                 }
    1689                 :            : 
    1690                 :            :                 /*
    1691                 :            :                  * add_to_page_cache_lru locks the page, and for mmap we expect
    1692                 :            :                  * an unlocked page.
    1693                 :            :                  */
    1694                 :          3 :                 if (page && (fgp_flags & FGP_FOR_MMAP))
    1695                 :          0 :                         unlock_page(page);
    1696                 :            :         }
    1697                 :            : 
    1698                 :          3 :         return page;
    1699                 :            : }
    1700                 :            : EXPORT_SYMBOL(pagecache_get_page);
    1701                 :            : 
    1702                 :            : /**
    1703                 :            :  * find_get_entries - gang pagecache lookup
    1704                 :            :  * @mapping:    The address_space to search
    1705                 :            :  * @start:      The starting page cache index
    1706                 :            :  * @nr_entries: The maximum number of entries (0 returns immediately)
    1707                 :            :  * @entries:    Where the resulting entries are placed
    1708                 :            :  * @indices:    The cache indices corresponding to the entries in @entries
    1709                 :            :  *
    1710                 :            :  * find_get_entries() will search for and return a group of up to
    1711                 :            :  * @nr_entries entries in the mapping.  The entries are placed at
    1712                 :            :  * @entries.  find_get_entries() takes a reference against any actual
    1713                 :            :  * pages it returns; value entries are stored without a reference.
    1714                 :            :  *
    1715                 :            :  * The search returns a group of mapping-contiguous page cache entries
    1716                 :            :  * with ascending indexes.  There may be holes in the indices due to
    1717                 :            :  * not-present pages.
    1718                 :            :  *
    1719                 :            :  * Any shadow entries of evicted pages, or swap entries from
    1720                 :            :  * shmem/tmpfs, are included in the returned array.
    1721                 :            :  *
    1722                 :            :  * Return: the number of pages and value entries stored in @entries.
    1723                 :            :  */
    1724                 :          3 : unsigned find_get_entries(struct address_space *mapping,
    1725                 :            :                           pgoff_t start, unsigned int nr_entries,
    1726                 :            :                           struct page **entries, pgoff_t *indices)
    1727                 :            : {
    1728                 :          3 :         XA_STATE(xas, &mapping->i_pages, start);
    1729                 :            :         struct page *page;
    1730                 :            :         unsigned int ret = 0; /* number of slots filled so far */
    1731                 :            : 
    1732                 :          3 :         if (!nr_entries)
    1733                 :            :                 return 0;
    1734                 :            : 
    1735                 :            :         rcu_read_lock();
    1736                 :          3 :         xas_for_each(&xas, page, ULONG_MAX) { /* no upper bound on the index */
    1737                 :          3 :                 if (xas_retry(&xas, page))
    1738                 :          0 :                         continue;
    1739                 :            :                 /*
    1740                 :            :                  * A shadow entry of a recently evicted page, a swap
    1741                 :            :                  * entry from shmem/tmpfs or a DAX entry.  Return it
    1742                 :            :                  * without attempting to raise page count.
    1743                 :            :                  */
    1744                 :          3 :                 if (xa_is_value(page))
    1745                 :            :                         goto export;
    1746                 :            : 
    1747                 :          3 :                 if (!page_cache_get_speculative(page))
    1748                 :            :                         goto retry; /* no reference was taken */
    1749                 :            : 
    1750                 :            :                 /* Has the page moved or been split? */
    1751                 :          3 :                 if (unlikely(page != xas_reload(&xas)))
    1752                 :            :                         goto put_page; /* drop the reference, then retry */
    1753                 :          3 :                 page = find_subpage(page, xas.xa_index);
    1754                 :            : 
    1755                 :            : export:
    1756                 :          3 :                 indices[ret] = xas.xa_index;
    1757                 :          3 :                 entries[ret] = page;
    1758                 :          3 :                 if (++ret == nr_entries) /* output arrays are full */
    1759                 :            :                         break;
    1760                 :          3 :                 continue;
    1761                 :            : put_page:
    1762                 :          0 :                 put_page(page);
    1763                 :            : retry:
    1764                 :            :                 xas_reset(&xas);
    1765                 :            :         }
    1766                 :            :         rcu_read_unlock();
    1767                 :          3 :         return ret;
    1768                 :            : }
    1769                 :            : 
    1770                 :            : /**
    1771                 :            :  * find_get_pages_range - gang pagecache lookup
    1772                 :            :  * @mapping:    The address_space to search
    1773                 :            :  * @start:      The starting page index; updated for the next traversal
    1774                 :            :  * @end:        The final page index (inclusive)
    1775                 :            :  * @nr_pages:   The maximum number of pages (0 returns immediately)
    1776                 :            :  * @pages:      Where the resulting pages are placed
    1777                 :            :  *
    1778                 :            :  * find_get_pages_range() will search for and return a group of up to @nr_pages
    1779                 :            :  * pages in the mapping starting at index @start and up to index @end
    1780                 :            :  * (inclusive).  The pages are placed at @pages.  find_get_pages_range() takes
    1781                 :            :  * a reference against the returned pages.
    1782                 :            :  *
    1783                 :            :  * The search returns a group of mapping-contiguous pages with ascending
    1784                 :            :  * indexes.  There may be holes in the indices due to not-present pages.
    1785                 :            :  * We also update @start to index the next page for the traversal.
    1786                 :            :  *
    1787                 :            :  * Return: the number of pages which were found. If this number is
    1788                 :            :  * smaller than @nr_pages, the end of the specified range has been
    1789                 :            :  * reached.
    1790                 :            :  */
    1791                 :          3 : unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
    1792                 :            :                               pgoff_t end, unsigned int nr_pages,
    1793                 :            :                               struct page **pages)
    1794                 :            : {
    1795                 :          3 :         XA_STATE(xas, &mapping->i_pages, *start);
    1796                 :            :         struct page *page;
    1797                 :            :         unsigned ret = 0; /* number of pages stored so far */
    1798                 :            : 
    1799                 :          3 :         if (unlikely(!nr_pages))
    1800                 :            :                 return 0;
    1801                 :            : 
    1802                 :            :         rcu_read_lock();
    1803                 :          3 :         xas_for_each(&xas, page, end) {
    1804                 :          3 :                 if (xas_retry(&xas, page))
    1805                 :          0 :                         continue;
    1806                 :            :                 /* Skip over shadow, swap and DAX entries */
    1807                 :          3 :                 if (xa_is_value(page))
    1808                 :          0 :                         continue;
    1809                 :            : 
    1810                 :          3 :                 if (!page_cache_get_speculative(page))
    1811                 :            :                         goto retry; /* no reference was taken */
    1812                 :            : 
    1813                 :            :                 /* Has the page moved or been split? */
    1814                 :          3 :                 if (unlikely(page != xas_reload(&xas)))
    1815                 :            :                         goto put_page; /* drop the reference, then retry */
    1816                 :            : 
    1817                 :          3 :                 pages[ret] = find_subpage(page, xas.xa_index);
    1818                 :          3 :                 if (++ret == nr_pages) {
    1819                 :          3 :                         *start = xas.xa_index + 1; /* resume after this page */
    1820                 :          3 :                         goto out;
    1821                 :            :                 }
    1822                 :          3 :                 continue;
    1823                 :            : put_page:
    1824                 :          0 :                 put_page(page);
    1825                 :            : retry:
    1826                 :            :                 xas_reset(&xas);
    1827                 :            :         }
    1828                 :            : 
    1829                 :            :         /*
    1830                 :            :          * We come here when there is no page beyond @end. We take care to not
    1831                 :            :          * overflow the index @start as it confuses some of the callers. This
    1832                 :            :          * breaks the iteration when there is a page at index -1 but that is
    1833                 :            :          * already broken anyway.
    1834                 :            :          */
    1835                 :          3 :         if (end == (pgoff_t)-1)
    1836                 :          0 :                 *start = (pgoff_t)-1;
    1837                 :            :         else
    1838                 :          3 :                 *start = end + 1;
    1839                 :            : out:
    1840                 :            :         rcu_read_unlock();
    1841                 :            : 
    1842                 :          3 :         return ret;
    1843                 :            : }
    1844                 :            : 
    1845                 :            : /**
    1846                 :            :  * find_get_pages_contig - gang contiguous pagecache lookup
    1847                 :            :  * @mapping:    The address_space to search
    1848                 :            :  * @index:      The starting page index
    1849                 :            :  * @nr_pages:   The maximum number of pages (0 returns immediately)
    1850                 :            :  * @pages:      Where the resulting pages are placed
    1851                 :            :  *
    1852                 :            :  * find_get_pages_contig() works like find_get_pages(), but stops at the first
    1853                 :            :  * hole or value entry, so the returned pages are guaranteed to be contiguous.
    1854                 :            :  *
    1855                 :            :  * Return: the number of contiguous pages found, starting at @index.
    1856                 :            :  */
    1857                 :          0 : unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
    1858                 :            :                                unsigned int nr_pages, struct page **pages)
    1859                 :            : {
    1860                 :          0 :         XA_STATE(xas, &mapping->i_pages, index);
    1861                 :            :         struct page *page;
    1862                 :            :         unsigned int ret = 0; /* number of pages stored so far */
    1863                 :            : 
    1864                 :          0 :         if (unlikely(!nr_pages))
    1865                 :            :                 return 0;
    1866                 :            : 
    1867                 :            :         rcu_read_lock();
    1868                 :          0 :         for (page = xas_load(&xas); page; page = xas_next(&xas)) { /* a NULL slot ends the run */
    1869                 :          0 :                 if (xas_retry(&xas, page))
    1870                 :          0 :                         continue;
    1871                 :            :                 /*
    1872                 :            :                  * If the entry has been swapped out, we can stop looking.
    1873                 :            :                  * No current caller is looking for DAX entries.
    1874                 :            :                  */
    1875                 :          0 :                 if (xa_is_value(page))
    1876                 :            :                         break;
    1877                 :            : 
    1878                 :          0 :                 if (!page_cache_get_speculative(page))
    1879                 :            :                         goto retry; /* no reference was taken */
    1880                 :            : 
    1881                 :            :                 /* Has the page moved or been split? */
    1882                 :          0 :                 if (unlikely(page != xas_reload(&xas)))
    1883                 :            :                         goto put_page; /* drop the reference, then retry */
    1884                 :            : 
    1885                 :          0 :                 pages[ret] = find_subpage(page, xas.xa_index);
    1886                 :          0 :                 if (++ret == nr_pages) /* output array is full */
    1887                 :            :                         break;
    1888                 :          0 :                 continue;
    1889                 :            : put_page:
    1890                 :          0 :                 put_page(page);
    1891                 :            : retry:
    1892                 :            :                 xas_reset(&xas);
    1893                 :            :         }
    1894                 :            :         rcu_read_unlock();
    1895                 :          0 :         return ret;
    1896                 :            : }
    1897                 :            : EXPORT_SYMBOL(find_get_pages_contig);
    1898                 :            : 
    1899                 :            : /**
    1900                 :            :  * find_get_pages_range_tag - find and return pages in given range matching @tag
    1901                 :            :  * @mapping:    the address_space to search
    1902                 :            :  * @index:      the starting page index; updated for the next traversal
    1903                 :            :  * @end:        The final page index (inclusive)
    1904                 :            :  * @tag:        the tag index
    1905                 :            :  * @nr_pages:   the maximum number of pages (0 returns immediately)
    1906                 :            :  * @pages:      where the resulting pages are placed
    1907                 :            :  *
    1908                 :            :  * Like find_get_pages, except we only return pages which are tagged with
    1909                 :            :  * @tag.   We update @index to index the next page for the traversal.
    1910                 :            :  *
    1911                 :            :  * Return: the number of pages which were found.
    1912                 :            :  */
    1913                 :          3 : unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
    1914                 :            :                         pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
    1915                 :            :                         struct page **pages)
    1916                 :            : {
    1917                 :          3 :         XA_STATE(xas, &mapping->i_pages, *index);
    1918                 :            :         struct page *page;
    1919                 :            :         unsigned ret = 0; /* number of pages stored so far */
    1920                 :            : 
    1921                 :          3 :         if (unlikely(!nr_pages))
    1922                 :            :                 return 0;
    1923                 :            : 
    1924                 :            :         rcu_read_lock();
    1925                 :          3 :         xas_for_each_marked(&xas, page, end, tag) {
    1926                 :          3 :                 if (xas_retry(&xas, page))
    1927                 :          0 :                         continue;
    1928                 :            :                 /*
    1929                 :            :                  * Shadow entries should never be tagged, but this iteration
    1930                 :            :                  * is lockless so there is a window for page reclaim to evict
    1931                 :            :                  * a page we saw tagged.  Skip over it.
    1932                 :            :                  */
    1933                 :          3 :                 if (xa_is_value(page))
    1934                 :          0 :                         continue;
    1935                 :            : 
    1936                 :          3 :                 if (!page_cache_get_speculative(page))
    1937                 :            :                         goto retry; /* no reference was taken */
    1938                 :            : 
    1939                 :            :                 /* Has the page moved or been split? */
    1940                 :          3 :                 if (unlikely(page != xas_reload(&xas)))
    1941                 :            :                         goto put_page; /* drop the reference, then retry */
    1942                 :            : 
    1943                 :          3 :                 pages[ret] = find_subpage(page, xas.xa_index);
    1944                 :          3 :                 if (++ret == nr_pages) {
    1945                 :          3 :                         *index = xas.xa_index + 1; /* resume after this page */
    1946                 :          3 :                         goto out;
    1947                 :            :                 }
    1948                 :          3 :                 continue;
    1949                 :            : put_page:
    1950                 :          0 :                 put_page(page);
    1951                 :            : retry:
    1952                 :            :                 xas_reset(&xas);
    1953                 :            :         }
    1954                 :            : 
    1955                 :            :         /*
    1956                 :            :          * We come here when we got to @end. We take care to not overflow the
    1957                 :            :          * index @index as it confuses some of the callers. This breaks the
    1958                 :            :          * iteration when there is a page at index -1 but that is already
    1959                 :            :          * broken anyway.
    1960                 :            :          */
    1961                 :          3 :         if (end == (pgoff_t)-1)
    1962                 :          3 :                 *index = (pgoff_t)-1;
    1963                 :            :         else
    1964                 :          3 :                 *index = end + 1;
    1965                 :            : out:
    1966                 :            :         rcu_read_unlock();
    1967                 :            : 
    1968                 :          3 :         return ret;
    1969                 :            : }
    1970                 :            : EXPORT_SYMBOL(find_get_pages_range_tag);
    1971                 :            : 
    1972                 :            : /*
    1973                 :            :  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
    1974                 :            :  * a _large_ part of the i/o request. Imagine the worst scenario:
    1975                 :            :  *
    1976                 :            :  *      ---R__________________________________________B__________
    1977                 :            :  *         ^ reading here                             ^ bad block(assume 4k)
    1978                 :            :  *
    1979                 :            :  * read(R) => miss => readahead(R...B) => media error => frustrating retries
    1980                 :            :  * => failing the whole request => read(R) => read(R+1) =>
    1981                 :            :  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
    1982                 :            :  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
    1983                 :            :  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
    1984                 :            :  *
    1985                 :            :  * It is going insane. Fix it by quickly scaling down the readahead size.
    1986                 :            :  */
    1987                 :            : static void shrink_readahead_size_eio(struct file *filp,
    1988                 :            :                                         struct file_ra_state *ra)
    1989                 :            : {
    1990                 :          0 :         ra->ra_pages /= 4; /* quarter the readahead size; @filp is not used */
    1991                 :            : }
    1992                 :            : 
    1993                 :            : /**
    1994                 :            :  * generic_file_buffered_read - generic file read routine
    1995                 :            :  * @iocb:       the iocb to read
    1996                 :            :  * @iter:       data destination
    1997                 :            :  * @written:    already copied
    1998                 :            :  *
    1999                 :            :  * This is a generic file read routine, and uses the
    2000                 :            :  * mapping->a_ops->readpage() function for the actual low-level stuff.
    2001                 :            :  *
    2002                 :            :  * This is really ugly. But the goto's actually try to clarify some
    2003                 :            :  * of the logic when it comes to error handling etc.
    2004                 :            :  *
    2005                 :            :  * Return:
    2006                 :            :  * * total number of bytes copied, including those that were already @written
    2007                 :            :  * * negative error code if nothing was copied
    2008                 :            :  */
    2009                 :          3 : static ssize_t generic_file_buffered_read(struct kiocb *iocb,
    2010                 :            :                 struct iov_iter *iter, ssize_t written)
    2011                 :            : {
    2012                 :          3 :         struct file *filp = iocb->ki_filp;
    2013                 :          3 :         struct address_space *mapping = filp->f_mapping;
    2014                 :          3 :         struct inode *inode = mapping->host;
    2015                 :          3 :         struct file_ra_state *ra = &filp->f_ra;
    2016                 :            :         loff_t *ppos = &iocb->ki_pos;
    2017                 :            :         pgoff_t index;
    2018                 :            :         pgoff_t last_index;
    2019                 :            :         pgoff_t prev_index;
    2020                 :            :         unsigned long offset;      /* offset into pagecache page */
    2021                 :            :         unsigned int prev_offset;
    2022                 :            :         int error = 0;
    2023                 :            : 
    2024                 :          3 :         if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
    2025                 :            :                 return 0;
    2026                 :          3 :         iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    2027                 :            : 
    2028                 :          3 :         index = *ppos >> PAGE_SHIFT;
    2029                 :          3 :         prev_index = ra->prev_pos >> PAGE_SHIFT;
    2030                 :          3 :         prev_offset = ra->prev_pos & (PAGE_SIZE-1);
    2031                 :          3 :         last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
    2032                 :          3 :         offset = *ppos & ~PAGE_MASK;
    2033                 :            : 
    2034                 :            :         for (;;) {
    2035                 :            :                 struct page *page;
    2036                 :            :                 pgoff_t end_index;
    2037                 :            :                 loff_t isize;
    2038                 :            :                 unsigned long nr, ret;
    2039                 :            : 
    2040                 :          3 :                 cond_resched();
    2041                 :            : find_page:
    2042                 :          3 :                 if (fatal_signal_pending(current)) {
    2043                 :            :                         error = -EINTR;
    2044                 :            :                         goto out;
    2045                 :            :                 }
    2046                 :            : 
    2047                 :            :                 page = find_get_page(mapping, index);
    2048                 :          3 :                 if (!page) {
    2049                 :          3 :                         if (iocb->ki_flags & IOCB_NOWAIT)
    2050                 :            :                                 goto would_block;
    2051                 :          3 :                         page_cache_sync_readahead(mapping,
    2052                 :            :                                         ra, filp,
    2053                 :            :                                         index, last_index - index);
    2054                 :            :                         page = find_get_page(mapping, index);
    2055                 :          3 :                         if (unlikely(page == NULL))
    2056                 :            :                                 goto no_cached_page;
    2057                 :            :                 }
    2058                 :          3 :                 if (PageReadahead(page)) {
    2059                 :          3 :                         page_cache_async_readahead(mapping,
    2060                 :            :                                         ra, filp, page,
    2061                 :            :                                         index, last_index - index);
    2062                 :            :                 }
    2063                 :          3 :                 if (!PageUptodate(page)) {
    2064                 :          3 :                         if (iocb->ki_flags & IOCB_NOWAIT) {
    2065                 :          0 :                                 put_page(page);
    2066                 :          0 :                                 goto would_block;
    2067                 :            :                         }
    2068                 :            : 
    2069                 :            :                         /*
    2070                 :            :                          * See comment in do_read_cache_page on why
    2071                 :            :                          * wait_on_page_locked is used to avoid unnecessary
    2072                 :            :                          * serialisations and why it's safe.
    2073                 :            :                          */
    2074                 :          3 :                         error = wait_on_page_locked_killable(page);
    2075                 :          3 :                         if (unlikely(error))
    2076                 :            :                                 goto readpage_error;
    2077                 :          3 :                         if (PageUptodate(page))
    2078                 :            :                                 goto page_ok;
    2079                 :            : 
    2080                 :          3 :                         if (inode->i_blkbits == PAGE_SHIFT ||
    2081                 :          0 :                                         !mapping->a_ops->is_partially_uptodate)
    2082                 :            :                                 goto page_not_up_to_date;
    2083                 :            :                         /* pipes can't handle partially uptodate pages */
    2084                 :          0 :                         if (unlikely(iov_iter_is_pipe(iter)))
    2085                 :            :                                 goto page_not_up_to_date;
    2086                 :          0 :                         if (!trylock_page(page))
    2087                 :            :                                 goto page_not_up_to_date;
    2088                 :            :                         /* Did it get truncated before we got the lock? */
    2089                 :          0 :                         if (!page->mapping)
    2090                 :            :                                 goto page_not_up_to_date_locked;
    2091                 :          0 :                         if (!mapping->a_ops->is_partially_uptodate(page,
    2092                 :          0 :                                                         offset, iter->count))
    2093                 :            :                                 goto page_not_up_to_date_locked;
    2094                 :          0 :                         unlock_page(page);
    2095                 :            :                 }
    2096                 :            : page_ok:
    2097                 :            :                 /*
    2098                 :            :                  * i_size must be checked after we know the page is Uptodate.
    2099                 :            :                  *
    2100                 :            :                  * Checking i_size after the check allows us to calculate
    2101                 :            :                  * the correct value for "nr", which means the zero-filled
    2102                 :            :                  * part of the page is not copied back to userspace (unless
    2103                 :            :                  * another truncate extends the file - this is desired though).
    2104                 :            :                  */
    2105                 :            : 
    2106                 :            :                 isize = i_size_read(inode);
    2107                 :          3 :                 end_index = (isize - 1) >> PAGE_SHIFT;
    2108                 :          3 :                 if (unlikely(!isize || index > end_index)) {
    2109                 :          3 :                         put_page(page);
    2110                 :          3 :                         goto out;
    2111                 :            :                 }
    2112                 :            : 
    2113                 :            :                 /* nr is the maximum number of bytes to copy from this page */
    2114                 :            :                 nr = PAGE_SIZE;
    2115                 :          3 :                 if (index == end_index) {
    2116                 :          3 :                         nr = ((isize - 1) & ~PAGE_MASK) + 1;
    2117                 :          3 :                         if (nr <= offset) {
    2118                 :          3 :                                 put_page(page);
    2119                 :          3 :                                 goto out;
    2120                 :            :                         }
    2121                 :            :                 }
    2122                 :          3 :                 nr = nr - offset;
    2123                 :            : 
    2124                 :            :                 /* If users can be writing to this page using arbitrary
    2125                 :            :                  * virtual addresses, take care about potential aliasing
    2126                 :            :                  * before reading the page on the kernel side.
    2127                 :            :                  */
    2128                 :          3 :                 if (mapping_writably_mapped(mapping))
    2129                 :          2 :                         flush_dcache_page(page);
    2130                 :            : 
    2131                 :            :                 /*
    2132                 :            :                  * When a sequential read accesses a page several times,
    2133                 :            :                  * only mark it as accessed the first time.
    2134                 :            :                  */
    2135                 :          3 :                 if (prev_index != index || offset != prev_offset)
    2136                 :          3 :                         mark_page_accessed(page);
    2137                 :            :                 prev_index = index;
    2138                 :            : 
    2139                 :            :                 /*
    2140                 :            :                  * Ok, we have the page, and it's up-to-date, so
    2141                 :            :                  * now we can copy it to user space...
    2142                 :            :                  */
    2143                 :            : 
    2144                 :          3 :                 ret = copy_page_to_iter(page, offset, nr, iter);
    2145                 :          3 :                 offset += ret;
    2146                 :          3 :                 index += offset >> PAGE_SHIFT;
    2147                 :          3 :                 offset &= ~PAGE_MASK;
    2148                 :            :                 prev_offset = offset;
    2149                 :            : 
    2150                 :          3 :                 put_page(page);
    2151                 :          3 :                 written += ret;
    2152                 :          3 :                 if (!iov_iter_count(iter))
    2153                 :            :                         goto out;
    2154                 :          3 :                 if (ret < nr) {
    2155                 :            :                         error = -EFAULT;
    2156                 :            :                         goto out;
    2157                 :            :                 }
    2158                 :          3 :                 continue;
    2159                 :            : 
    2160                 :            : page_not_up_to_date:
    2161                 :            :                 /* Get exclusive access to the page ... */
    2162                 :          3 :                 error = lock_page_killable(page);
    2163                 :          3 :                 if (unlikely(error))
    2164                 :            :                         goto readpage_error;
    2165                 :            : 
    2166                 :            : page_not_up_to_date_locked:
    2167                 :            :                 /* Did it get truncated before we got the lock? */
    2168                 :          3 :                 if (!page->mapping) {
    2169                 :          0 :                         unlock_page(page);
    2170                 :          0 :                         put_page(page);
    2171                 :          0 :                         continue;
    2172                 :            :                 }
    2173                 :            : 
    2174                 :            :                 /* Did somebody else fill it already? */
    2175                 :          3 :                 if (PageUptodate(page)) {
    2176                 :          0 :                         unlock_page(page);
    2177                 :          0 :                         goto page_ok;
    2178                 :            :                 }
    2179                 :            : 
    2180                 :            : readpage:
    2181                 :            :                 /*
    2182                 :            :                  * A previous I/O error may have been due to temporary
    2183                 :            :                  * failures, eg. multipath errors.
    2184                 :            :                  * PG_error will be set again if readpage fails.
    2185                 :            :                  */
    2186                 :            :                 ClearPageError(page);
    2187                 :            :                 /* Start the actual read. The read will unlock the page. */
    2188                 :          3 :                 error = mapping->a_ops->readpage(filp, page);
    2189                 :            : 
    2190                 :          3 :                 if (unlikely(error)) {
    2191                 :          0 :                         if (error == AOP_TRUNCATED_PAGE) {
    2192                 :          0 :                                 put_page(page);
    2193                 :            :                                 error = 0;
    2194                 :          0 :                                 goto find_page;
    2195                 :            :                         }
    2196                 :            :                         goto readpage_error;
    2197                 :            :                 }
    2198                 :            : 
    2199                 :          3 :                 if (!PageUptodate(page)) {
    2200                 :          2 :                         error = lock_page_killable(page);
    2201                 :          2 :                         if (unlikely(error))
    2202                 :            :                                 goto readpage_error;
    2203                 :          2 :                         if (!PageUptodate(page)) {
    2204                 :          0 :                                 if (page->mapping == NULL) {
    2205                 :            :                                         /*
    2206                 :            :                                          * invalidate_mapping_pages got it
    2207                 :            :                                          */
    2208                 :          0 :                                         unlock_page(page);
    2209                 :          0 :                                         put_page(page);
    2210                 :          0 :                                         goto find_page;
    2211                 :            :                                 }
    2212                 :          0 :                                 unlock_page(page);
    2213                 :            :                                 shrink_readahead_size_eio(filp, ra);
    2214                 :            :                                 error = -EIO;
    2215                 :          0 :                                 goto readpage_error;
    2216                 :            :                         }
    2217                 :          2 :                         unlock_page(page);
    2218                 :            :                 }
    2219                 :            : 
    2220                 :            :                 goto page_ok;
    2221                 :            : 
    2222                 :            : readpage_error:
    2223                 :            :                 /* UHHUH! A synchronous read error occurred. Report it */
    2224                 :          3 :                 put_page(page);
    2225                 :          0 :                 goto out;
    2226                 :            : 
    2227                 :            : no_cached_page:
    2228                 :            :                 /*
    2229                 :            :                  * Ok, it wasn't cached, so we need to create a new
    2230                 :            :                  * page..
    2231                 :            :                  */
    2232                 :            :                 page = page_cache_alloc(mapping);
    2233                 :          3 :                 if (!page) {
    2234                 :            :                         error = -ENOMEM;
    2235                 :            :                         goto out;
    2236                 :            :                 }
    2237                 :          3 :                 error = add_to_page_cache_lru(page, mapping, index,
    2238                 :            :                                 mapping_gfp_constraint(mapping, GFP_KERNEL));
    2239                 :          3 :                 if (error) {
    2240                 :          0 :                         put_page(page);
    2241                 :          0 :                         if (error == -EEXIST) {
    2242                 :            :                                 error = 0;
    2243                 :            :                                 goto find_page;
    2244                 :            :                         }
    2245                 :            :                         goto out;
    2246                 :            :                 }
    2247                 :            :                 goto readpage;
    2248                 :            :         }
    2249                 :            : 
    2250                 :            : would_block:
    2251                 :            :         error = -EAGAIN;
    2252                 :            : out:
    2253                 :          3 :         ra->prev_pos = prev_index;
    2254                 :          3 :         ra->prev_pos <<= PAGE_SHIFT;
    2255                 :          3 :         ra->prev_pos |= prev_offset;
    2256                 :            : 
    2257                 :          3 :         *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
    2258                 :            :         file_accessed(filp);
    2259                 :          3 :         return written ? written : error;
    2260                 :            : }
    2261                 :            : 
    2262                 :            : /**
    2263                 :            :  * generic_file_read_iter - generic filesystem read routine
    2264                 :            :  * @iocb:       kernel I/O control block
    2265                 :            :  * @iter:       destination for the data read
    2266                 :            :  *
    2267                 :            :  * This is the "read_iter()" routine for all filesystems
    2268                 :            :  * that can use the page cache directly.
                         :            :  *
                         :            :  * An empty @iter returns 0 at once, skipping the file_accessed() call.
                         :            :  *
                         :            :  * For IOCB_DIRECT the byte range [ki_pos, ki_pos + count - 1] is first
                         :            :  * checked against the page cache: with IOCB_NOWAIT the read fails with
                         :            :  * -EAGAIN if filemap_range_has_page() reports a page in that range,
                         :            :  * otherwise the range is passed to filemap_write_and_wait_range() and a
                         :            :  * negative result from it is returned.  The request then goes to
                         :            :  * ->direct_IO(); on success @iocb->ki_pos is advanced by the bytes read.
                         :            :  * Whatever is still outstanding is handed to generic_file_buffered_read(),
                         :            :  * unless ->direct_IO() failed, nothing is left to read, ki_pos has reached
                         :            :  * the i_size sampled before the I/O, or the inode is DAX.
                         :            :  *
                         :            :  * Without IOCB_DIRECT the whole request goes straight to
                         :            :  * generic_file_buffered_read().
                         :            :  *
    2269                 :            :  * Return:
    2270                 :            :  * * number of bytes copied, even for partial reads
    2271                 :            :  * * negative error code if nothing was read
                         :            :  * * 0 if @iter was empty on entry
    2272                 :            :  */
    2273                 :            : ssize_t
    2274                 :          3 : generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
    2275                 :            : {
    2276                 :            :         size_t count = iov_iter_count(iter);
    2277                 :            :         ssize_t retval = 0;
    2278                 :            : 
    2279                 :          3 :         if (!count)
    2280                 :            :                 goto out; /* skip atime */
    2281                 :            : 
    2282                 :          3 :         if (iocb->ki_flags & IOCB_DIRECT) {
    2283                 :          0 :                 struct file *file = iocb->ki_filp;
    2284                 :          0 :                 struct address_space *mapping = file->f_mapping;
    2285                 :          0 :                 struct inode *inode = mapping->host;
    2286                 :            :                 loff_t size;
    2287                 :            : 
    2288                 :            :                 size = i_size_read(inode); /* sampled before the I/O; used for the EOF test below */
    2289                 :          0 :                 if (iocb->ki_flags & IOCB_NOWAIT) {
    2290                 :          0 :                         if (filemap_range_has_page(mapping, iocb->ki_pos,
    2291                 :          0 :                                                    iocb->ki_pos + count - 1))
    2292                 :            :                                 return -EAGAIN;
    2293                 :            :                 } else {
    2294                 :          0 :                         retval = filemap_write_and_wait_range(mapping,
    2295                 :            :                                                 iocb->ki_pos,
    2296                 :          0 :                                                 iocb->ki_pos + count - 1);
    2297                 :          0 :                         if (retval < 0)
    2298                 :            :                                 goto out;
    2299                 :            :                 }
    2300                 :            : 
    2301                 :            :                 file_accessed(file);
    2302                 :            : 
    2303                 :          0 :                 retval = mapping->a_ops->direct_IO(iocb, iter);
    2304                 :          0 :                 if (retval >= 0) {
    2305                 :          0 :                         iocb->ki_pos += retval;
    2306                 :          0 :                         count -= retval; /* count is now the number of bytes still wanted */
    2307                 :            :                 }
    2308                 :          0 :                 iov_iter_revert(iter, count - iov_iter_count(iter)); /* NOTE(review): presumably rewinds iter so 'count' bytes remain pending - confirm against iov_iter_revert() */
    2309                 :            : 
    2310                 :            :                 /*
    2311                 :            :                  * Btrfs can have a short DIO read if we encounter
    2312                 :            :                  * compressed extents, so if there was an error, or if
    2313                 :            :                  * we've already read everything we wanted to, or if
    2314                 :            :                  * there was a short read because we hit EOF, go ahead
    2315                 :            :                  * and return.  Otherwise fallthrough to buffered io for
    2316                 :            :                  * the rest of the read.  Buffered reads will not work for
    2317                 :            :                  * DAX files, so don't bother trying.
    2318                 :            :                  */
    2319                 :          0 :                 if (retval < 0 || !count || iocb->ki_pos >= size ||
    2320                 :            :                     IS_DAX(inode))
    2321                 :            :                         goto out;
    2322                 :            :         }
    2323                 :            : 
    2324                 :          3 :         retval = generic_file_buffered_read(iocb, iter, retval); /* retval is 0, or the bytes already read by direct I/O */
    2325                 :            : out:
    2326                 :          3 :         return retval;
    2327                 :            : }
    2328                 :            : EXPORT_SYMBOL(generic_file_read_iter);
    2329                 :            : 
    2330                 :            : #ifdef CONFIG_MMU
    2331                 :            : #define MMAP_LOTSAMISS  (100) /* ra->mmap_miss level above which do_sync_mmap_readahead() skips read-around */
    2332                 :            : /*
    2333                 :            :  * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
    2334                 :            :  * @vmf - the vm_fault for this fault.
    2335                 :            :  * @page - the page to lock.
    2336                 :            :  * @fpin - the pointer to the file we may pin (or is already pinned).
    2337                 :            :  *
    2338                 :            :  * This works similarly to lock_page_or_retry in that it can drop the mmap_sem.
    2339                 :            :  * It differs in that it actually returns the page locked if it returns 1 and 0
    2340                 :            :  * if it couldn't lock the page.  If we did have to drop the mmap_sem then fpin
    2341                 :            :  * will point to the pinned file and needs to be fput()'ed at a later point.
                         :            :  *
                         :            :  * Return:
                         :            :  * 1 - @page is locked.  Either trylock_page() succeeded straight away, or
                         :            :  *     the lock was taken by sleeping in __lock_page() or
                         :            :  *     __lock_page_killable() after maybe_unlock_mmap_for_io() was called.
                         :            :  * 0 - @page is not locked.  Either FAULT_FLAG_RETRY_NOWAIT was set and the
                         :            :  *     trylock failed, or FAULT_FLAG_KILLABLE was set and
                         :            :  *     __lock_page_killable() returned non-zero; in the latter case the
                         :            :  *     mmap_sem is released here if *@fpin is still NULL.
    2342                 :            :  */
    2343                 :          3 : static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
    2344                 :            :                                      struct file **fpin)
    2345                 :            : {
    2346                 :          3 :         if (trylock_page(page))
    2347                 :            :                 return 1; /* got the lock without sleeping; *fpin is left as it was */
    2348                 :            : 
    2349                 :            :         /*
    2350                 :            :          * NOTE! This will make us return with VM_FAULT_RETRY, but with
    2351                 :            :          * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
    2352                 :            :          * is supposed to work. We have way too many special cases..
    2353                 :            :          */
    2354                 :          3 :         if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
    2355                 :            :                 return 0;
    2356                 :            : 
    2357                 :          3 :         *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
    2358                 :          3 :         if (vmf->flags & FAULT_FLAG_KILLABLE) {
    2359                 :          3 :                 if (__lock_page_killable(page)) {
    2360                 :            :                         /*
    2361                 :            :                          * We didn't have the right flags to drop the mmap_sem,
    2362                 :            :                          * but all fault_handlers only check for fatal signals
    2363                 :            :                          * if we return VM_FAULT_RETRY, so we need to drop the
    2364                 :            :                          * mmap_sem here and return 0 if we don't have a fpin.
    2365                 :            :                          */
    2366                 :          0 :                         if (*fpin == NULL)
    2367                 :          0 :                                 up_read(&vmf->vma->vm_mm->mmap_sem);
    2368                 :            :                         return 0;
    2369                 :            :                 }
    2370                 :            :         } else
    2371                 :          2 :                 __lock_page(page);
    2372                 :            :         return 1;
    2373                 :            : }
    2374                 :            : 
    2375                 :            : 
    2376                 :            : /*
    2377                 :            :  * Synchronous readahead happens when we don't even find a page in the page
    2378                 :            :  * cache at all.  We don't want to perform IO under the mmap sem, so if we have
    2379                 :            :  * to drop the mmap sem we return the file that was pinned in order for us to do
    2380                 :            :  * that.  If we didn't pin a file then we return NULL.  The file that is
    2381                 :            :  * returned needs to be fput()'ed when we're done with it.
                         :            :  *
                         :            :  * Policy, checked in this order:
                         :            :  *  - VM_RAND_READ set on the vma, or ra->ra_pages == 0: nothing is read.
                         :            :  *  - VM_SEQ_READ set on the vma: page_cache_sync_readahead() is called with
                         :            :  *    the faulting page offset and ra->ra_pages.
                         :            :  *  - otherwise a miss is counted in ra->mmap_miss (the counter is no longer
                         :            :  *    incremented once it reaches MMAP_LOTSAMISS * 10).  If the counter is
                         :            :  *    above MMAP_LOTSAMISS nothing is read; else a read-around window is
                         :            :  *    submitted with ra_submit(): ra->start is the faulting offset minus
                         :            :  *    ra->ra_pages / 2 (clamped to 0), ra->size is ra->ra_pages and
                         :            :  *    ra->async_size is ra->ra_pages / 4.
                         :            :  *
                         :            :  * maybe_unlock_mmap_for_io() is called before either readahead is issued;
                         :            :  * its result is what gets returned.
    2382                 :            :  */
    2383                 :          3 : static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
    2384                 :            : {
    2385                 :          3 :         struct file *file = vmf->vma->vm_file;
    2386                 :          3 :         struct file_ra_state *ra = &file->f_ra;
    2387                 :          3 :         struct address_space *mapping = file->f_mapping;
    2388                 :            :         struct file *fpin = NULL;
    2389                 :          3 :         pgoff_t offset = vmf->pgoff;
    2390                 :            : 
    2391                 :            :         /* If we don't want any read-ahead, don't bother */
    2392                 :          3 :         if (vmf->vma->vm_flags & VM_RAND_READ)
    2393                 :            :                 return fpin;
    2394                 :          3 :         if (!ra->ra_pages)
    2395                 :            :                 return fpin;
    2396                 :            : 
    2397                 :          3 :         if (vmf->vma->vm_flags & VM_SEQ_READ) {
    2398                 :          0 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    2399                 :          0 :                 page_cache_sync_readahead(mapping, ra, file, offset,
    2400                 :          0 :                                           ra->ra_pages);
    2401                 :          0 :                 return fpin;
    2402                 :            :         }
    2403                 :            : 
    2404                 :            :         /* Avoid banging the cache line if not needed */
    2405                 :          3 :         if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
    2406                 :          3 :                 ra->mmap_miss++;
    2407                 :            : 
    2408                 :            :         /*
    2409                 :            :          * Do we miss much more than hit in this file? If so,
    2410                 :            :          * stop bothering with read-ahead. It will only hurt.
    2411                 :            :          */
    2412                 :          3 :         if (ra->mmap_miss > MMAP_LOTSAMISS)
    2413                 :            :                 return fpin;
    2414                 :            : 
    2415                 :            :         /*
    2416                 :            :          * mmap read-around
    2417                 :            :          */
    2418                 :          3 :         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    2419                 :          3 :         ra->start = max_t(long, 0, offset - ra->ra_pages / 2); /* window begins half of ra_pages before the fault, never below page 0 */
    2420                 :          3 :         ra->size = ra->ra_pages;
    2421                 :          3 :         ra->async_size = ra->ra_pages / 4;
    2422                 :            :         ra_submit(ra, mapping, file);
    2423                 :          3 :         return fpin;
    2424                 :            : }
    2425                 :            : 
    2426                 :            : /*
    2427                 :            :  * Asynchronous readahead happens when we find the page and PG_readahead,
    2428                 :            :  * so we want to possibly extend the readahead further.  We return the file that
    2429                 :            :  * was pinned if we have to drop the mmap_sem in order to do IO.
                         :            :  *
                         :            :  * Nothing is done for VM_RAND_READ vmas or when ra->ra_pages is 0.  Otherwise
                         :            :  * ra->mmap_miss is decremented (never below 0), and only if
                         :            :  * PageReadahead(@page) is true is page_cache_async_readahead() called, after
                         :            :  * maybe_unlock_mmap_for_io().  The return value is NULL unless
                         :            :  * maybe_unlock_mmap_for_io() was called and returned a file.
    2430                 :            :  */
    2431                 :          3 : static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
    2432                 :            :                                             struct page *page)
    2433                 :            : {
    2434                 :          3 :         struct file *file = vmf->vma->vm_file;
    2435                 :          3 :         struct file_ra_state *ra = &file->f_ra;
    2436                 :          3 :         struct address_space *mapping = file->f_mapping;
    2437                 :            :         struct file *fpin = NULL;
    2438                 :          3 :         pgoff_t offset = vmf->pgoff;
    2439                 :            : 
    2440                 :            :         /* If we don't want any read-ahead, don't bother */
    2441                 :          3 :         if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
    2442                 :            :                 return fpin;
    2443                 :          3 :         if (ra->mmap_miss > 0)
    2444                 :          3 :                 ra->mmap_miss--; /* the counterpart of the increment in do_sync_mmap_readahead() */
    2445                 :          3 :         if (PageReadahead(page)) {
    2446                 :          3 :                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    2447                 :          3 :                 page_cache_async_readahead(mapping, ra, file,
    2448                 :          3 :                                            page, offset, ra->ra_pages);
    2449                 :            :         }
    2450                 :          3 :         return fpin;
    2451                 :            : }
    2452                 :            : 
    2453                 :            : /**
    2454                 :            :  * filemap_fault - read in file data for page fault handling
    2455                 :            :  * @vmf:        struct vm_fault containing details of the fault
    2456                 :            :  *
    2457                 :            :  * filemap_fault() is invoked via the vma operations vector for a
    2458                 :            :  * mapped memory region to read in file data during a page fault.
    2459                 :            :  *
    2460                 :            :  * The goto's are kind of ugly, but this streamlines the normal case of having
    2461                 :            :  * it in the page cache, and handles the special cases reasonably without
    2462                 :            :  * having a lot of duplicated code.
    2463                 :            :  *
    2464                 :            :  * vma->vm_mm->mmap_sem must be held on entry.
    2465                 :            :  *
    2466                 :            :  * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
    2467                 :            :  * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
    2468                 :            :  *
    2469                 :            :  * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
    2470                 :            :  * has not been released.
    2471                 :            :  *
    2472                 :            :  * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
    2473                 :            :  *
    2474                 :            :  * Return: bitwise-OR of %VM_FAULT_ codes.
    2475                 :            :  */
    2476                 :          3 : vm_fault_t filemap_fault(struct vm_fault *vmf)
    2477                 :            : {
    2478                 :            :         int error;
    2479                 :          3 :         struct file *file = vmf->vma->vm_file;
    2480                 :          3 :         struct file *fpin = NULL;
    2481                 :          3 :         struct address_space *mapping = file->f_mapping;
    2482                 :            :         struct file_ra_state *ra = &file->f_ra;
    2483                 :          3 :         struct inode *inode = mapping->host;
    2484                 :          3 :         pgoff_t offset = vmf->pgoff;
    2485                 :            :         pgoff_t max_off;
    2486                 :            :         struct page *page;
    2487                 :            :         vm_fault_t ret = 0;
    2488                 :            : 
    2489                 :          3 :         max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    2490                 :          3 :         if (unlikely(offset >= max_off))
    2491                 :            :                 return VM_FAULT_SIGBUS;
    2492                 :            : 
    2493                 :            :         /*
    2494                 :            :          * Do we have something in the page cache already?
    2495                 :            :          */
    2496                 :            :         page = find_get_page(mapping, offset);
    2497                 :          3 :         if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
    2498                 :            :                 /*
    2499                 :            :                  * We found the page, so try async readahead before
    2500                 :            :                  * waiting for the lock.
    2501                 :            :                  */
    2502                 :          3 :                 fpin = do_async_mmap_readahead(vmf, page);
    2503                 :          3 :         } else if (!page) {
    2504                 :            :                 /* No page in the page cache at all */
    2505                 :            :                 count_vm_event(PGMAJFAULT);
    2506                 :          3 :                 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
    2507                 :            :                 ret = VM_FAULT_MAJOR;
    2508                 :          3 :                 fpin = do_sync_mmap_readahead(vmf);
    2509                 :            : retry_find:
    2510                 :          3 :                 page = pagecache_get_page(mapping, offset,
    2511                 :            :                                           FGP_CREAT|FGP_FOR_MMAP,
    2512                 :            :                                           vmf->gfp_mask);
    2513                 :          3 :                 if (!page) {
    2514                 :          0 :                         if (fpin)
    2515                 :            :                                 goto out_retry;
    2516                 :            :                         return vmf_error(-ENOMEM);
    2517                 :            :                 }
    2518                 :            :         }
    2519                 :            : 
    2520                 :          3 :         if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
    2521                 :            :                 goto out_retry;
    2522                 :            : 
    2523                 :            :         /* Did it get truncated? */
    2524                 :          3 :         if (unlikely(compound_head(page)->mapping != mapping)) {
    2525                 :          0 :                 unlock_page(page);
    2526                 :          0 :                 put_page(page);
    2527                 :          0 :                 goto retry_find;
    2528                 :            :         }
    2529                 :            :         VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
    2530                 :            : 
    2531                 :            :         /*
    2532                 :            :          * We have a locked page in the page cache, now we need to check
    2533                 :            :          * that it's up-to-date. If not, it is going to be due to an error.
    2534                 :            :          */
    2535                 :          3 :         if (unlikely(!PageUptodate(page)))
    2536                 :            :                 goto page_not_uptodate;
    2537                 :            : 
    2538                 :            :         /*
    2539                 :            :          * We've made it this far and we had to drop our mmap_sem, now is the
    2540                 :            :          * time to return to the upper layer and have it re-find the vma and
    2541                 :            :          * redo the fault.
    2542                 :            :          */
    2543                 :          3 :         if (fpin) {
    2544                 :          3 :                 unlock_page(page);
    2545                 :          3 :                 goto out_retry;
    2546                 :            :         }
    2547                 :            : 
    2548                 :            :         /*
    2549                 :            :          * Found the page and have a reference on it.
    2550                 :            :          * We must recheck i_size under page lock.
    2551                 :            :          */
    2552                 :          3 :         max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
    2553                 :          3 :         if (unlikely(offset >= max_off)) {
    2554                 :          0 :                 unlock_page(page);
    2555                 :          0 :                 put_page(page);
    2556                 :          0 :                 return VM_FAULT_SIGBUS;
    2557                 :            :         }
    2558                 :            : 
    2559                 :          3 :         vmf->page = page;
    2560                 :          3 :         return ret | VM_FAULT_LOCKED;
    2561                 :            : 
    2562                 :            : page_not_uptodate:
    2563                 :            :         /*
    2564                 :            :          * Umm, take care of errors if the page isn't up-to-date.
    2565                 :            :          * Try to re-read it _once_. We do this synchronously,
    2566                 :            :          * because there really aren't any performance issues here
    2567                 :            :          * and we need to check for errors.
    2568                 :            :          */
    2569                 :            :         ClearPageError(page);
    2570                 :          0 :         fpin = maybe_unlock_mmap_for_io(vmf, fpin);
    2571                 :          0 :         error = mapping->a_ops->readpage(file, page);
    2572                 :          0 :         if (!error) {
    2573                 :          0 :                 wait_on_page_locked(page);
    2574                 :          0 :                 if (!PageUptodate(page))
    2575                 :            :                         error = -EIO;
    2576                 :            :         }
    2577                 :          0 :         if (fpin)
    2578                 :            :                 goto out_retry;
    2579                 :          0 :         put_page(page);
    2580                 :            : 
    2581                 :          0 :         if (!error || error == AOP_TRUNCATED_PAGE)
    2582                 :            :                 goto retry_find;
    2583                 :            : 
    2584                 :            :         /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */
    2585                 :            :         shrink_readahead_size_eio(file, ra);
    2586                 :          0 :         return VM_FAULT_SIGBUS;
    2587                 :            : 
    2588                 :            : out_retry:
    2589                 :            :         /*
    2590                 :            :          * We dropped the mmap_sem, we need to return to the fault handler to
    2591                 :            :          * re-find the vma and come back and find our hopefully still populated
    2592                 :            :          * page.
    2593                 :            :          */
    2594                 :          3 :         if (page)
    2595                 :          3 :                 put_page(page);
    2596                 :          3 :         if (fpin)
    2597                 :          3 :                 fput(fpin);
    2598                 :          3 :         return ret | VM_FAULT_RETRY;
    2599                 :            : }
    2600                 :            : EXPORT_SYMBOL(filemap_fault);
    2601                 :            : 
                                        /*
                                         * filemap_map_pages - map already-cached pages in a range of page offsets
                                         * @vmf:         fault descriptor; vmf->address and (when set) vmf->pte are
                                         *               stepped forward as entries are processed
                                         * @start_pgoff: first page-cache index to consider
                                         * @end_pgoff:   upper bound handed to xas_for_each()
                                         *
                                         * Walks mapping->i_pages under rcu_read_lock() and hands every suitable
                                         * page to alloc_set_pte().  A page is used only if it is not locked, a
                                         * speculative reference can be taken, it is still the entry stored in the
                                         * xarray, it is uptodate, not marked readahead or hwpoison, can be
                                         * trylocked, still belongs to the mapping and lies below i_size.  The
                                         * page lock is only ever taken with trylock_page(); pages failing any
                                         * check are skipped.
                                         *
                                         * Side effects: file->f_ra.mmap_miss is decremented (never below zero)
                                         * for each page that passes all checks.  The walk stops early once
                                         * *vmf->pmd is a transparent huge pmd.
                                         */
    2602                 :          3 : void filemap_map_pages(struct vm_fault *vmf,
    2603                 :            :                 pgoff_t start_pgoff, pgoff_t end_pgoff)
    2604                 :            : {
    2605                 :          3 :         struct file *file = vmf->vma->vm_file;
    2606                 :          3 :         struct address_space *mapping = file->f_mapping;
    2607                 :            :         pgoff_t last_pgoff = start_pgoff;
    2608                 :            :         unsigned long max_idx;
    2609                 :          3 :         XA_STATE(xas, &mapping->i_pages, start_pgoff);
    2610                 :            :         struct page *page;
    2611                 :            : 
    2612                 :            :         rcu_read_lock();
    2613                 :          3 :         xas_for_each(&xas, page, end_pgoff) {
    2614                 :          3 :                 if (xas_retry(&xas, page))
    2615                 :          0 :                         continue;
                                                        /* Value entries are not page pointers: nothing to map. */
    2616                 :          3 :                 if (xa_is_value(page))
    2617                 :            :                         goto next;
    2618                 :            : 
    2619                 :            :                 /*
    2620                 :            :                  * Check for a locked page first, as a speculative
    2621                 :            :                  * reference may adversely influence page migration.
    2622                 :            :                  */
    2623                 :          3 :                 if (PageLocked(page))
    2624                 :            :                         goto next;
    2625                 :          3 :                 if (!page_cache_get_speculative(page))
    2626                 :            :                         goto next;
    2627                 :            : 
    2628                 :            :                 /* Has the page moved or been split? */
    2629                 :          3 :                 if (unlikely(page != xas_reload(&xas)))
    2630                 :            :                         goto skip;
    2631                 :          3 :                 page = find_subpage(page, xas.xa_index);
    2632                 :            : 
    2633                 :          3 :                 if (!PageUptodate(page) ||
    2634                 :            :                                 PageReadahead(page) ||
    2635                 :            :                                 PageHWPoison(page))
    2636                 :            :                         goto skip;
    2637                 :          3 :                 if (!trylock_page(page))
    2638                 :            :                         goto skip;
    2639                 :            : 
                                                        /* Recheck under the page lock; the tests above ran without it. */
    2640                 :          3 :                 if (page->mapping != mapping || !PageUptodate(page))
    2641                 :            :                         goto unlock;
    2642                 :            : 
    2643                 :          3 :                 max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
    2644                 :          3 :                 if (page->index >= max_idx)
    2645                 :            :                         goto unlock;
    2646                 :            : 
    2647                 :          3 :                 if (file->f_ra.mmap_miss > 0)
    2648                 :          3 :                         file->f_ra.mmap_miss--;
    2649                 :            : 
                                                        /*
                                                         * Step address/pte forward by the distance from the
                                                         * previously attempted index; last_pgoff is updated
                                                         * whether or not alloc_set_pte() succeeds.
                                                         */
    2650                 :          3 :                 vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
    2651                 :          3 :                 if (vmf->pte)
    2652                 :          3 :                         vmf->pte += xas.xa_index - last_pgoff;
    2653                 :            :                 last_pgoff = xas.xa_index;
                                                        /*
                                                         * Non-zero return: unlock and drop our reference.  On a
                                                         * zero return only the lock is released; put_page() is
                                                         * skipped, so the reference taken above is kept.
                                                         */
    2654                 :          3 :                 if (alloc_set_pte(vmf, NULL, page))
    2655                 :            :                         goto unlock;
    2656                 :          3 :                 unlock_page(page);
    2657                 :          3 :                 goto next;
    2658                 :            : unlock:
    2659                 :          3 :                 unlock_page(page);
    2660                 :            : skip:
    2661                 :          3 :                 put_page(page);
    2662                 :            : next:
    2663                 :            :                 /* Huge page is mapped? No need to proceed. */
    2664                 :            :                 if (pmd_trans_huge(*vmf->pmd))
    2665                 :            :                         break;
    2666                 :            :         }
    2667                 :            :         rcu_read_unlock();
    2668                 :          3 : }
    2669                 :            : EXPORT_SYMBOL(filemap_map_pages);
    2670                 :            : 
                                        /*
                                         * filemap_page_mkwrite - page_mkwrite handler used by generic_file_vm_ops
                                         * @vmf: fault descriptor; vmf->page is the page operated on
                                         *
                                         * Brackets the work with sb_start_pagefault()/sb_end_pagefault(), updates
                                         * the file times and then locks the page.  If the page no longer belongs
                                         * to the inode's mapping it is unlocked again and VM_FAULT_NOPAGE is
                                         * returned.  Otherwise the page is dirtied, wait_for_stable_page() is
                                         * called and VM_FAULT_LOCKED is returned with the page still locked.
                                         */
    2671                 :          0 : vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
    2672                 :            : {
    2673                 :          0 :         struct page *page = vmf->page;
    2674                 :          0 :         struct inode *inode = file_inode(vmf->vma->vm_file);
    2675                 :            :         vm_fault_t ret = VM_FAULT_LOCKED;
    2676                 :            : 
    2677                 :          0 :         sb_start_pagefault(inode->i_sb);
    2678                 :          0 :         file_update_time(vmf->vma->vm_file);
    2679                 :          0 :         lock_page(page);
                                                /* The mapping may have changed before we got the page lock. */
    2680                 :          0 :         if (page->mapping != inode->i_mapping) {
    2681                 :          0 :                 unlock_page(page);
    2682                 :            :                 ret = VM_FAULT_NOPAGE;
    2683                 :          0 :                 goto out;
    2684                 :            :         }
    2685                 :            :         /*
    2686                 :            :          * We mark the page dirty already here so that when freeze is in
    2687                 :            :          * progress, we are guaranteed that writeback during freezing will
    2688                 :            :          * see the dirty page and writeprotect it again.
    2689                 :            :          */
    2690                 :          0 :         set_page_dirty(page);
    2691                 :          0 :         wait_for_stable_page(page);
    2692                 :            : out:
    2693                 :          0 :         sb_end_pagefault(inode->i_sb);
    2694                 :          0 :         return ret;
    2695                 :            : }
    2696                 :            : 
                                        /*
                                         * Default vm_operations installed by generic_file_mmap(); every hook
                                         * points at the generic page-cache implementation in this file.
                                         */
    2697                 :            : const struct vm_operations_struct generic_file_vm_ops = {
    2698                 :            :         .fault          = filemap_fault,
    2699                 :            :         .map_pages      = filemap_map_pages,
    2700                 :            :         .page_mkwrite   = filemap_page_mkwrite,
    2701                 :            : };
    2702                 :            : 
    2703                 :            : /* This is used for a general mmap of a disk file */
    2704                 :            : 
                                        /*
                                         * generic_file_mmap - install generic_file_vm_ops on a mapping
                                         * @file: file being mapped
                                         * @vma:  vma describing the mapping; only vma->vm_ops is written here
                                         *
                                         * Return: -ENOEXEC if the file's address_space has no ->readpage method,
                                         * otherwise 0 after calling file_accessed() and setting vma->vm_ops.
                                         */
    2705                 :          0 : int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
    2706                 :            : {
    2707                 :          0 :         struct address_space *mapping = file->f_mapping;
    2708                 :            : 
    2709                 :          0 :         if (!mapping->a_ops->readpage)
    2710                 :            :                 return -ENOEXEC;
    2711                 :            :         file_accessed(file);
    2712                 :          0 :         vma->vm_ops = &generic_file_vm_ops;
    2713                 :          0 :         return 0;
    2714                 :            : }
    2715                 :            : 
    2716                 :            : /*
    2717                 :            :  * This is for filesystems which do not implement ->writepage.
                                         *
                                         * Mappings that have both VM_SHARED and VM_MAYWRITE set are refused with
                                         * -EINVAL; everything else is handed to generic_file_mmap() and its
                                         * return value is passed through unchanged.
    2718                 :            :  */
    2719                 :          0 : int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
    2720                 :            : {
    2721                 :          0 :         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
    2722                 :            :                 return -EINVAL;
    2723                 :          0 :         return generic_file_mmap(file, vma);
    2724                 :            : }
    2725                 :            : #else
    2726                 :            : vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
    2727                 :            : {
                                                /*
                                                 * Stub from the #else branch of the conditional closed by
                                                 * "#endif CONFIG_MMU" below: unconditionally reports SIGBUS.
                                                 */
    2728                 :            :         return VM_FAULT_SIGBUS;
    2729                 :            : }
    2730                 :            : int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
    2731                 :            : {
                                                /* Stub (#else branch): both arguments are ignored. */
    2732                 :            :         return -ENOSYS;
    2733                 :            : }
    2734                 :            : int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
    2735                 :            : {
                                                /* Stub (#else branch): both arguments are ignored. */
    2736                 :            :         return -ENOSYS;
    2737                 :            : }
    2738                 :            : #endif /* CONFIG_MMU */
    2739                 :            : 
                                        /*
                                         * These exports sit after the #endif, so they apply to whichever set of
                                         * definitions was compiled: the full versions or the stubs.
                                         */
    2740                 :            : EXPORT_SYMBOL(filemap_page_mkwrite);
    2741                 :            : EXPORT_SYMBOL(generic_file_mmap);
    2742                 :            : EXPORT_SYMBOL(generic_file_readonly_mmap);
    2743                 :            : 
                                        /*
                                         * wait_on_page_read - wait for a page to be unlocked and check the result
                                         * @page: page to wait on, or an ERR_PTR() which is passed straight through
                                         *
                                         * Waits until @page is unlocked.  If it is still not uptodate afterwards,
                                         * one reference is dropped with put_page() and ERR_PTR(-EIO) is returned;
                                         * otherwise @page itself is returned.
                                         */
    2744                 :          3 : static struct page *wait_on_page_read(struct page *page)
    2745                 :            : {
    2746                 :          3 :         if (!IS_ERR(page)) {
    2747                 :          3 :                 wait_on_page_locked(page);
    2748                 :          3 :                 if (!PageUptodate(page)) {
    2749                 :          0 :                         put_page(page);
    2750                 :            :                         page = ERR_PTR(-EIO);
    2751                 :            :                 }
    2752                 :            :         }
    2753                 :          3 :         return page;
    2754                 :            : }
    2755                 :            : 
                                        /*
                                         * do_read_cache_page - find or create a page-cache page and make it uptodate
                                         * @mapping: address_space to look the page up in
                                         * @index:   page index within @mapping
                                         * @filler:  read callback, called as filler(data, page); when NULL,
                                         *           mapping->a_ops->readpage(data, page) is used instead
                                         * @data:    opaque first argument for the read callback
                                         * @gfp:     flags used to allocate a new page and to insert it
                                         *
                                         * Return: the page, with a reference held and mark_page_accessed()
                                         * applied, or an ERR_PTR(): -ENOMEM if allocation fails, the error from
                                         * add_to_page_cache_lru() (other than -EEXIST, which restarts the lookup),
                                         * a negative return from the read callback, or -EIO from
                                         * wait_on_page_read() when the page does not become uptodate.
                                         */
    2756                 :          3 : static struct page *do_read_cache_page(struct address_space *mapping,
    2757                 :            :                                 pgoff_t index,
    2758                 :            :                                 int (*filler)(void *, struct page *),
    2759                 :            :                                 void *data,
    2760                 :            :                                 gfp_t gfp)
    2761                 :            : {
    2762                 :            :         struct page *page;
    2763                 :            :         int err;
    2764                 :            : repeat:
    2765                 :            :         page = find_get_page(mapping, index);
    2766                 :          3 :         if (!page) {
    2767                 :            :                 page = __page_cache_alloc(gfp);
    2768                 :          3 :                 if (!page)
    2769                 :            :                         return ERR_PTR(-ENOMEM);
    2770                 :          3 :                 err = add_to_page_cache_lru(page, mapping, index, gfp);
    2771                 :          3 :                 if (unlikely(err)) {
    2772                 :          1 :                         put_page(page);
    2773                 :          1 :                         if (err == -EEXIST)
    2774                 :            :                                 goto repeat;
    2775                 :            :                         /* Presumably ENOMEM for xarray node */
    2776                 :          0 :                         return ERR_PTR(err);
    2777                 :            :                 }
    2778                 :            : 
                                                        /*
                                                         * Label, not the parameter: C labels have their own
                                                         * namespace, so "filler" below still names the callback.
                                                         * Also the re-entry point from the lock_page() path at
                                                         * the bottom of the function.
                                                         */
    2779                 :            : filler:
    2780                 :          3 :                 if (filler)
    2781                 :          0 :                         err = filler(data, page);
    2782                 :            :                 else
    2783                 :          3 :                         err = mapping->a_ops->readpage(data, page);
    2784                 :            : 
                                                        /*
                                                         * Only negative values count as failure here.
                                                         * NOTE(review): filemap_fault() checks its ->readpage
                                                         * result against AOP_TRUNCATED_PAGE; that code is not
                                                         * treated specially here - confirm this is intended.
                                                         */
    2785                 :          3 :                 if (err < 0) {
    2786                 :          0 :                         put_page(page);
    2787                 :          0 :                         return ERR_PTR(err);
    2788                 :            :                 }
    2789                 :            : 
    2790                 :          3 :                 page = wait_on_page_read(page);
    2791                 :          3 :                 if (IS_ERR(page))
    2792                 :            :                         return page;
    2793                 :            :                 goto out;
    2794                 :            :         }
    2795                 :          3 :         if (PageUptodate(page))
    2796                 :            :                 goto out;
    2797                 :            : 
    2798                 :            :         /*
    2799                 :            :          * Page is not up to date and may be locked due to one of the following
    2800                 :            :          * case a: Page is being filled and the page lock is held
    2801                 :            :          * case b: Read/write error clearing the page uptodate status
    2802                 :            :          * case c: Truncation in progress (page locked)
    2803                 :            :          * case d: Reclaim in progress
    2804                 :            :          *
    2805                 :            :          * Case a, the page will be up to date when the page is unlocked.
    2806                 :            :          *    There is no need to serialise on the page lock here as the page
    2807                 :            :          *    is pinned so the lock gives no additional protection. Even if the
    2808                 :            :          *    page is truncated, the data is still valid if PageUptodate as
    2809                 :            :          *    it's a read vs truncate race.
    2810                 :            :          * Case b, the page will not be up to date
    2811                 :            :          * Case c, the page may be truncated but in itself, the data may still
    2812                 :            :          *    be valid after IO completes as it's a read vs truncate race. The
    2813                 :            :          *    operation must restart if the page is not uptodate on unlock but
    2814                 :            :          *    otherwise serialising on page lock to stabilise the mapping gives
    2815                 :            :          *    no additional guarantees to the caller as the page lock is
    2816                 :            :          *    released before return.
    2817                 :            :          * Case d, similar to truncation. If reclaim holds the page lock, it
    2818                 :            :          *    will be a race with remove_mapping that determines if the mapping
    2819                 :            :          *    is valid on unlock but otherwise the data is valid and there is
    2820                 :            :          *    no need to serialise with page lock.
    2821                 :            :          *
    2822                 :            :          * As the page lock gives no additional guarantee, we optimistically
    2823                 :            :          * wait on the page to be unlocked and check if it's up to date and
    2824                 :            :          * use the page if it is. Otherwise, the page lock is required to
    2825                 :            :          * distinguish between the different cases. The motivation is that we
    2826                 :            :          * avoid spurious serialisations and wakeups when multiple processes
    2827                 :            :          * wait on the same page for IO to complete.
    2828                 :            :          */
    2829                 :          1 :         wait_on_page_locked(page);
    2830                 :          1 :         if (PageUptodate(page))
    2831                 :            :                 goto out;
    2832                 :            : 
    2833                 :            :         /* Distinguish between all the cases under the safety of the lock */
    2834                 :          0 :         lock_page(page);
    2835                 :            : 
    2836                 :            :         /* Case c or d, restart the operation */
    2837                 :          0 :         if (!page->mapping) {
    2838                 :          0 :                 unlock_page(page);
    2839                 :          0 :                 put_page(page);
    2840                 :          0 :                 goto repeat;
    2841                 :            :         }
    2842                 :            : 
    2843                 :            :         /* Someone else locked and filled the page in a very small window */
    2844                 :          0 :         if (PageUptodate(page)) {
    2845                 :          0 :                 unlock_page(page);
    2846                 :          0 :                 goto out;
    2847                 :            :         }
                                                /* Still not uptodate and we hold the page lock: issue the read. */
    2848                 :            :         goto filler;
    2849                 :            : 
    2850                 :            : out:
    2851                 :          3 :         mark_page_accessed(page);
    2852                 :          3 :         return page;
    2853                 :            : }
    2854                 :            : 
    2855                 :            : /**
    2856                 :            :  * read_cache_page - read into page cache, fill it if needed
    2857                 :            :  * @mapping:    the page's address_space
    2858                 :            :  * @index:      the page index
    2859                 :            :  * @filler:     function to perform the read
    2860                 :            :  * @data:       first arg to filler(data, page) function, often left as NULL
    2861                 :            :  *
    2862                 :            :  * Read into the page cache. If a page already exists, and PageUptodate() is
    2863                 :            :  * not set, try to fill the page and wait for it to become unlocked.
                                         *
                                         * This is do_read_cache_page() with the allocation flags taken from
                                         * mapping_gfp_mask(@mapping).  A NULL @filler makes it fall back to
                                         * mapping->a_ops->readpage(@data, page).
    2864                 :            :  *
    2865                 :            :  * If the page does not get brought uptodate, return -EIO.
    2866                 :            :  *
    2867                 :            :  * Return: up to date page on success, ERR_PTR() on failure.
    2868                 :            :  */
    2869                 :          3 : struct page *read_cache_page(struct address_space *mapping,
    2870                 :            :                                 pgoff_t index,
    2871                 :            :                                 int (*filler)(void *, struct page *),
    2872                 :            :                                 void *data)
    2873                 :            : {
    2874                 :          3 :         return do_read_cache_page(mapping, index, filler, data,
    2875                 :            :                         mapping_gfp_mask(mapping));
    2876                 :            : }
    2877                 :            : EXPORT_SYMBOL(read_cache_page);
    2878                 :            : 
    2879                 :            : /**
    2880                 :            :  * read_cache_page_gfp - read into page cache, using specified page allocation flags.
    2881                 :            :  * @mapping:    the page's address_space
    2882                 :            :  * @index:      the page index
    2883                 :            :  * @gfp:        the page allocator flags to use if allocating
    2884                 :            :  *
    2885                 :            :  * This is the same as "read_mapping_page(mapping, index, NULL)", but with
    2886                 :            :  * any new page allocations done using the specified allocation flags.
                                         *
                                         * Both the filler and its data argument are passed as NULL, so
                                         * do_read_cache_page() reads through mapping->a_ops->readpage(NULL, page).
    2887                 :            :  *
    2888                 :            :  * If the page does not get brought uptodate, return -EIO.
    2889                 :            :  *
    2890                 :            :  * Return: up to date page on success, ERR_PTR() on failure.
    2891                 :            :  */
    2892                 :          0 : struct page *read_cache_page_gfp(struct address_space *mapping,
    2893                 :            :                                 pgoff_t index,
    2894                 :            :                                 gfp_t gfp)
    2895                 :            : {
    2896                 :          0 :         return do_read_cache_page(mapping, index, NULL, NULL, gfp);
    2897                 :            : }
    2898                 :            : EXPORT_SYMBOL(read_cache_page_gfp);
    2899                 :            : 
    2900                 :            : /*
    2901                 :            :  * Don't operate on ranges the page cache doesn't support, and don't exceed the
    2902                 :            :  * LFS limits.  If pos is under the limit it becomes a short access.  If it
    2903                 :            :  * exceeds the limit we return -EFBIG.
                                         *
                                         * @file:  file being written; supplies s_maxbytes and the O_LARGEFILE flag
                                         * @pos:   starting file offset of the write
                                         * @count: in/out byte count, clamped so the write ends within both limits
                                         *
                                         * When RLIMIT_FSIZE is not RLIM_INFINITY and @pos is at or beyond it,
                                         * SIGXFSZ is sent to the current task before -EFBIG is returned.  The
                                         * size ceiling is the superblock's s_maxbytes, or MAX_NON_LFS when the
                                         * file was opened without O_LARGEFILE.
                                         *
                                         * Returns 0 (with *count possibly reduced) or -EFBIG.
    2904                 :            :  */
    2905                 :          3 : static int generic_write_check_limits(struct file *file, loff_t pos,
    2906                 :            :                                       loff_t *count)
    2907                 :            : {
    2908                 :          3 :         struct inode *inode = file->f_mapping->host;
    2909                 :          3 :         loff_t max_size = inode->i_sb->s_maxbytes;
    2910                 :          3 :         loff_t limit = rlimit(RLIMIT_FSIZE);
    2911                 :            : 
    2912                 :          3 :         if (limit != RLIM_INFINITY) {
    2913                 :          0 :                 if (pos >= limit) {
    2914                 :          0 :                         send_sig(SIGXFSZ, current, 0);
    2915                 :          0 :                         return -EFBIG;
    2916                 :            :                 }
    2917                 :          0 :                 *count = min(*count, limit - pos);
    2918                 :            :         }
    2919                 :            : 
    2920                 :          3 :         if (!(file->f_flags & O_LARGEFILE))
    2921                 :            :                 max_size = MAX_NON_LFS;
    2922                 :            : 
    2923                 :          3 :         if (unlikely(pos >= max_size))
    2924                 :            :                 return -EFBIG;
    2925                 :            : 
    2926                 :          3 :         *count = min(*count, max_size - pos);
    2927                 :            : 
    2928                 :          3 :         return 0;
    2929                 :            : }
    2930                 :            : 
    2931                 :            : /*
    2932                 :            :  * Performs necessary checks before doing a write
    2933                 :            :  *
    2934                 :            :  * Can adjust writing position or amount of bytes to write.
    2935                 :            :  * Returns a negative error code that the caller should return, zero
    2936                 :            :  * when there is nothing to write, or the number of bytes to write.
                                         *
                                         * Checks, in order: -ETXTBSY for a swapfile inode; 0 for an empty
                                         * iterator; IOCB_APPEND moves iocb->ki_pos to the current i_size;
                                         * -EINVAL for IOCB_NOWAIT without IOCB_DIRECT; then
                                         * generic_write_check_limits(), after which @from is truncated to the
                                         * permitted count and that count is returned.
    2937                 :            :  */
    2938                 :          3 : inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
    2939                 :            : {
    2940                 :          3 :         struct file *file = iocb->ki_filp;
    2941                 :          3 :         struct inode *inode = file->f_mapping->host;
    2942                 :            :         loff_t count;
    2943                 :            :         int ret;
    2944                 :            : 
    2945                 :          3 :         if (IS_SWAPFILE(inode))
    2946                 :            :                 return -ETXTBSY;
    2947                 :            : 
    2948                 :          3 :         if (!iov_iter_count(from))
    2949                 :            :                 return 0;
    2950                 :            : 
    2951                 :            :         /* FIXME: this is for backwards compatibility with 2.4 */
    2952                 :          3 :         if (iocb->ki_flags & IOCB_APPEND)
    2953                 :          3 :                 iocb->ki_pos = i_size_read(inode);
    2954                 :            : 
    2955                 :          3 :         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
    2956                 :            :                 return -EINVAL;
    2957                 :            : 
    2958                 :          3 :         count = iov_iter_count(from);
    2959                 :          3 :         ret = generic_write_check_limits(file, iocb->ki_pos, &count);
    2960                 :          3 :         if (ret)
    2961                 :            :                 return ret;
    2962                 :            : 
    2963                 :          3 :         iov_iter_truncate(from, count);
    2964                 :          3 :         return iov_iter_count(from);
    2965                 :            : }
    2966                 :            : EXPORT_SYMBOL(generic_write_checks);
    2967                 :            : 
    2968                 :            : /*
    2969                 :            :  * Performs necessary checks before doing a clone or dedupe from @file_in to @file_out.
    2970                 :            :  *
    2971                 :            :  * Can adjust amount of bytes to clone via @req_count argument; a shortened count is
    2972                 :            :  * rejected with -EINVAL unless @remap_flags has REMAP_FILE_CAN_SHORTEN.  Returns the error
    2973                 :            :  * code that caller should return or zero in case the clone should be allowed.
    2974                 :            :  */
    2975                 :          0 : int generic_remap_checks(struct file *file_in, loff_t pos_in,
    2976                 :            :                          struct file *file_out, loff_t pos_out,
    2977                 :            :                          loff_t *req_count, unsigned int remap_flags)
    2978                 :            : {
    2979                 :          0 :         struct inode *inode_in = file_in->f_mapping->host;
    2980                 :          0 :         struct inode *inode_out = file_out->f_mapping->host;
    2981                 :          0 :         uint64_t count = *req_count;
    2982                 :            :         uint64_t bcount;
    2983                 :            :         loff_t size_in, size_out;
    2984                 :          0 :         loff_t bs = inode_out->i_sb->s_blocksize;
    2985                 :            :         int ret;
    2986                 :            : 
    2987                 :            :         /* The start of both ranges must be aligned to an fs block (the outfile superblock's block size). */
    2988                 :          0 :         if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
    2989                 :            :                 return -EINVAL;
    2990                 :            : 
    2991                 :            :         /* Ensure offsets don't wrap. */
    2992                 :          0 :         if (pos_in + count < pos_in || pos_out + count < pos_out)
    2993                 :            :                 return -EINVAL;
    2994                 :            : 
    2995                 :            :         size_in = i_size_read(inode_in);
    2996                 :            :         size_out = i_size_read(inode_out);
    2997                 :            : 
    2998                 :            :         /* Dedupe requires both ranges to be within EOF. */
    2999                 :          0 :         if ((remap_flags & REMAP_FILE_DEDUP) &&
    3000                 :          0 :             (pos_in >= size_in || pos_in + count > size_in ||
    3001                 :          0 :              pos_out >= size_out || pos_out + count > size_out))
    3002                 :            :                 return -EINVAL;
    3003                 :            : 
    3004                 :            :         /* Ensure the infile range is within the infile; the count is clamped to the infile's EOF. */
    3005                 :          0 :         if (pos_in >= size_in)
    3006                 :            :                 return -EINVAL;
    3007                 :          0 :         count = min(count, size_in - (uint64_t)pos_in);
    3008                 :            :         /* count is passed by pointer, so this helper may also change it. */
    3009                 :          0 :         ret = generic_write_check_limits(file_out, pos_out, &count);
    3010                 :          0 :         if (ret)
    3011                 :            :                 return ret;
    3012                 :            : 
    3013                 :            :         /*
    3014                 :            :          * If the user wanted us to link to the infile's EOF, round up to the
    3015                 :            :          * next block boundary for this check.
    3016                 :            :          *
    3017                 :            :          * Otherwise, align the count down to a block multiple, having
    3018                 :            :          * already confirmed the starting offsets' block alignment.
    3019                 :            :          */
    3020                 :          0 :         if (pos_in + count == size_in) {
    3021                 :          0 :                 bcount = ALIGN(size_in, bs) - pos_in;
    3022                 :            :         } else {
    3023                 :          0 :                 if (!IS_ALIGNED(count, bs))
    3024                 :          0 :                         count = ALIGN_DOWN(count, bs);
    3025                 :          0 :                 bcount = count;
    3026                 :            :         }
    3027                 :            : 
    3028                 :            :         /* Don't allow overlapped cloning within the same file (tested with the block-rounded bcount). */
    3029                 :          0 :         if (inode_in == inode_out &&
    3030                 :          0 :             pos_out + bcount > pos_in &&
    3031                 :          0 :             pos_out < pos_in + bcount)
    3032                 :            :                 return -EINVAL;
    3033                 :            : 
    3034                 :            :         /*
    3035                 :            :          * We shortened the request but the caller can't deal with that, so
    3036                 :            :          * bounce the request back to userspace.
    3037                 :            :          */
    3038                 :          0 :         if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
    3039                 :            :                 return -EINVAL;
    3040                 :            : 
    3041                 :          0 :         *req_count = count;
    3042                 :          0 :         return 0;
    3043                 :            : }
    3044                 :            : 
    3045                 :            : 
    3046                 :            : /*
    3047                 :            :  * Performs common checks before doing a file copy/clone from @file_in to @file_out.
    3048                 :            :  * Returns 0 if allowed, -EISDIR if either is a directory, -EINVAL if either is not a regular file, else -EBADF.
    3049                 :            :  */
    3050                 :          0 : int generic_file_rw_checks(struct file *file_in, struct file *file_out)
    3051                 :            : {
    3052                 :            :         struct inode *inode_in = file_inode(file_in);
    3053                 :            :         struct inode *inode_out = file_inode(file_out);
    3054                 :            : 
    3055                 :            :         /* Don't copy dirs, pipes, sockets... */
    3056                 :          0 :         if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
    3057                 :            :                 return -EISDIR;
    3058                 :          0 :         if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
    3059                 :            :                 return -EINVAL;
    3060                 :            :         /* The source must be readable; the destination must be writable and not opened with O_APPEND. */
    3061                 :          0 :         if (!(file_in->f_mode & FMODE_READ) ||
    3062                 :          0 :             !(file_out->f_mode & FMODE_WRITE) ||
    3063                 :          0 :             (file_out->f_flags & O_APPEND))
    3064                 :            :                 return -EBADF;
    3065                 :            : 
    3066                 :          0 :         return 0;
    3067                 :            : }
    3068                 :            : 
    3069                 :            : /*
    3070                 :            :  * Performs necessary checks before doing a file copy from @file_in at @pos_in to @file_out at @pos_out.
    3071                 :            :  *
    3072                 :            :  * Can adjust amount of bytes to copy via @req_count argument (it is shortened to the infile's
    3073                 :            :  * EOF and then passed through generic_write_check_limits()).  Returns appropriate error code
    3074                 :            :  * that caller should return or zero in case the copy should be allowed.
    3075                 :            :  */
    3076                 :          0 : int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
    3077                 :            :                              struct file *file_out, loff_t pos_out,
    3078                 :            :                              size_t *req_count, unsigned int flags)
    3079                 :            : {
    3080                 :            :         struct inode *inode_in = file_inode(file_in);
    3081                 :            :         struct inode *inode_out = file_inode(file_out);
    3082                 :          0 :         uint64_t count = *req_count;
    3083                 :            :         loff_t size_in;
    3084                 :            :         int ret;
    3085                 :            : 
    3086                 :          0 :         ret = generic_file_rw_checks(file_in, file_out);
    3087                 :          0 :         if (ret)
    3088                 :            :                 return ret;
    3089                 :            : 
    3090                 :            :         /* Don't touch certain kinds of inodes: immutable destinations and swapfiles on either side. */
    3091                 :          0 :         if (IS_IMMUTABLE(inode_out))
    3092                 :            :                 return -EPERM;
    3093                 :            : 
    3094                 :          0 :         if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
    3095                 :            :                 return -ETXTBSY;
    3096                 :            : 
    3097                 :            :         /* Ensure offsets don't wrap. */
    3098                 :          0 :         if (pos_in + count < pos_in || pos_out + count < pos_out)
    3099                 :            :                 return -EOVERFLOW;
    3100                 :            : 
    3101                 :            :         /* Shorten the copy to EOF; starting at or past EOF yields a zero-length copy, not an error. */
    3102                 :            :         size_in = i_size_read(inode_in);
    3103                 :          0 :         if (pos_in >= size_in)
    3104                 :          0 :                 count = 0;
    3105                 :            :         else
    3106                 :          0 :                 count = min(count, size_in - (uint64_t)pos_in);
    3107                 :            :         /* count is passed by pointer, so this helper may also change it. */
    3108                 :          0 :         ret = generic_write_check_limits(file_out, pos_out, &count);
    3109                 :          0 :         if (ret)
    3110                 :            :                 return ret;
    3111                 :            : 
    3112                 :            :         /* Don't allow overlapped copying within the same file. */
    3113                 :          0 :         if (inode_in == inode_out &&
    3114                 :          0 :             pos_out + count > pos_in &&
    3115                 :          0 :             pos_out < pos_in + count)
    3116                 :            :                 return -EINVAL;
    3117                 :            :         /* Hand the (possibly shortened) length back to the caller. */
    3118                 :          0 :         *req_count = count;
    3119                 :          0 :         return 0;
    3120                 :            : }
    3121                 :            : /* Forwards all arguments to @mapping's a_ops->write_begin() and returns that callback's result unchanged. */
    3122                 :          0 : int pagecache_write_begin(struct file *file, struct address_space *mapping,
    3123                 :            :                                 loff_t pos, unsigned len, unsigned flags,
    3124                 :            :                                 struct page **pagep, void **fsdata)
    3125                 :            : {
    3126                 :          0 :         const struct address_space_operations *aops = mapping->a_ops;
    3127                 :            : 
    3128                 :          0 :         return aops->write_begin(file, mapping, pos, len, flags,
    3129                 :            :                                                         pagep, fsdata);
    3130                 :            : }
    3131                 :            : EXPORT_SYMBOL(pagecache_write_begin);
    3132                 :            : /* Forwards all arguments to @mapping's a_ops->write_end() and returns that callback's result unchanged. */
    3133                 :          0 : int pagecache_write_end(struct file *file, struct address_space *mapping,
    3134                 :            :                                 loff_t pos, unsigned len, unsigned copied,
    3135                 :            :                                 struct page *page, void *fsdata)
    3136                 :            : {
    3137                 :          0 :         const struct address_space_operations *aops = mapping->a_ops;
    3138                 :            : 
    3139                 :          0 :         return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
    3140                 :            : }
    3141                 :            : EXPORT_SYMBOL(pagecache_write_end);
    3142                 :            : /* Writes @from at iocb->ki_pos through a_ops->direct_IO(); returns its result, 0 to request a buffered fallback, or an error. */
    3143                 :            : ssize_t
    3144                 :          0 : generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
    3145                 :            : {
    3146                 :          0 :         struct file     *file = iocb->ki_filp;
    3147                 :          0 :         struct address_space *mapping = file->f_mapping;
    3148                 :          0 :         struct inode    *inode = mapping->host;
    3149                 :          0 :         loff_t          pos = iocb->ki_pos;
    3150                 :            :         ssize_t         written;
    3151                 :            :         size_t          write_len;
    3152                 :            :         pgoff_t         end;
    3153                 :            :         /* end is the page index of the last byte covered by the write. */
    3154                 :            :         write_len = iov_iter_count(from);
    3155                 :          0 :         end = (pos + write_len - 1) >> PAGE_SHIFT;
    3156                 :            : 
    3157                 :          0 :         if (iocb->ki_flags & IOCB_NOWAIT) {
    3158                 :            :                 /* If the range has any cached page, return -EAGAIN instead of waiting on writeback */
    3159                 :          0 :                 if (filemap_range_has_page(inode->i_mapping, pos,
    3160                 :            :                                            pos + write_len - 1))
    3161                 :            :                         return -EAGAIN;
    3162                 :            :         } else {
    3163                 :          0 :                 written = filemap_write_and_wait_range(mapping, pos,
    3164                 :            :                                                         pos + write_len - 1);
    3165                 :          0 :                 if (written)
    3166                 :            :                         goto out;
    3167                 :            :         }
    3168                 :            : 
    3169                 :            :         /*
    3170                 :            :          * After a write we want buffered reads to be sure to go to disk to get
    3171                 :            :          * the new data.  We invalidate clean cached pages from the region we're
    3172                 :            :          * about to write.  We do this *before* the write so that we can return
    3173                 :            :          * without clobbering -EIOCBQUEUED from ->direct_IO().
    3174                 :            :          */
    3175                 :          0 :         written = invalidate_inode_pages2_range(mapping,
    3176                 :          0 :                                         pos >> PAGE_SHIFT, end);
    3177                 :            :         /*
    3178                 :            :          * If a page cannot be invalidated (-EBUSY), return 0 to fall back
    3179                 :            :          * to buffered write; any other error is returned as is.
    3180                 :            :          */
    3181                 :          0 :         if (written) {
    3182                 :          0 :                 if (written == -EBUSY)
    3183                 :            :                         return 0;
    3184                 :            :                 goto out;
    3185                 :            :         }
    3186                 :            : 
    3187                 :          0 :         written = mapping->a_ops->direct_IO(iocb, from);
    3188                 :            : 
    3189                 :            :         /*
    3190                 :            :          * Finally, try again to invalidate clean pages which might have been
    3191                 :            :          * cached by non-direct readahead, or faulted in by get_user_pages()
    3192                 :            :          * if the source of the write was an mmap'ed region of the file
    3193                 :            :          * we're writing.  Either one is a pretty crazy thing to do,
    3194                 :            :          * so we don't support it 100%.  If this invalidation
    3195                 :            :          * fails, tough, the write still worked...
    3196                 :            :          *
    3197                 :            :          * Most of the time we do not need this since dio_complete() will do
    3198                 :            :          * the invalidation for us. However there are some file systems that
    3199                 :            :          * do not end up with dio_complete() being called, so let's not break
    3200                 :            :          * them by removing it completely.
    3201                 :            :          */
    3202                 :          0 :         if (mapping->nrpages)
    3203                 :          0 :                 invalidate_inode_pages2_range(mapping,
    3204                 :            :                                         pos >> PAGE_SHIFT, end);
    3205                 :            :         /* On progress, advance the position and, except for block devices, grow i_size if we wrote past it. */
    3206                 :          0 :         if (written > 0) {
    3207                 :          0 :                 pos += written;
    3208                 :          0 :                 write_len -= written;
    3209                 :          0 :                 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
    3210                 :            :                         i_size_write(inode, pos);
    3211                 :            :                         mark_inode_dirty(inode);
    3212                 :            :                 }
    3213                 :          0 :                 iocb->ki_pos = pos;
    3214                 :            :         }
    3215                 :          0 :         iov_iter_revert(from, write_len - iov_iter_count(from)); /* NOTE(review): appears to leave exactly the unwritten write_len bytes in @from - confirm */
    3216                 :            : out:
    3217                 :          0 :         return written;
    3218                 :            : }
    3219                 :            : EXPORT_SYMBOL(generic_file_direct_write);
    3220                 :            : 
    3221                 :            : /*
    3222                 :            :  * Find or create a page at the given pagecache position (FGP_LOCK|FGP_WRITE|FGP_CREAT). Return the locked
    3223                 :            :  * page, after wait_for_stable_page(), or the NULL result of pagecache_get_page(). This function is specifically for buffered writes.
    3224                 :            :  */
    3225                 :          3 : struct page *grab_cache_page_write_begin(struct address_space *mapping,
    3226                 :            :                                         pgoff_t index, unsigned flags)
    3227                 :            : {
    3228                 :            :         struct page *page;
    3229                 :            :         int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
    3230                 :            :         /* AOP_FLAG_NOFS in @flags is translated into FGP_NOFS for the lookup. */
    3231                 :          3 :         if (flags & AOP_FLAG_NOFS)
    3232                 :            :                 fgp_flags |= FGP_NOFS;
    3233                 :            : 
    3234                 :          3 :         page = pagecache_get_page(mapping, index, fgp_flags,
    3235                 :            :                         mapping_gfp_mask(mapping));
    3236                 :          3 :         if (page)
    3237                 :          3 :                 wait_for_stable_page(page);
    3238                 :            : 
    3239                 :          3 :         return page;
    3240                 :            : }
    3241                 :            : EXPORT_SYMBOL(grab_cache_page_write_begin);
    3242                 :            : /* Copies the data in @i into @file's page cache from @pos, at most one page per iteration, via a_ops->write_begin()/write_end(). */
    3243                 :          3 : ssize_t generic_perform_write(struct file *file,
    3244                 :            :                                 struct iov_iter *i, loff_t pos)
    3245                 :            : {
    3246                 :          3 :         struct address_space *mapping = file->f_mapping;
    3247                 :          3 :         const struct address_space_operations *a_ops = mapping->a_ops;
    3248                 :            :         long status = 0;
    3249                 :            :         ssize_t written = 0;
    3250                 :            :         unsigned int flags = 0;
    3251                 :            : 
    3252                 :            :         do {
    3253                 :            :                 struct page *page;
    3254                 :            :                 unsigned long offset;   /* Offset into pagecache page */
    3255                 :            :                 unsigned long bytes;    /* Bytes to write to page */
    3256                 :            :                 size_t copied;          /* Bytes copied from user */
    3257                 :            :                 void *fsdata;
    3258                 :            :                 /* Never cross a page boundary: write up to the end of the current page or of the data. */
    3259                 :          3 :                 offset = (pos & (PAGE_SIZE - 1));
    3260                 :          3 :                 bytes = min_t(unsigned long, PAGE_SIZE - offset,
    3261                 :            :                                                 iov_iter_count(i));
    3262                 :            : 
    3263                 :            : again:
    3264                 :            :                 /*
    3265                 :            :                  * Bring in the user page that we will copy from _first_.
    3266                 :            :                  * Otherwise there's a nasty deadlock on copying from the
    3267                 :            :                  * same page as we're writing to, without it being marked
    3268                 :            :                  * up-to-date.
    3269                 :            :                  *
    3270                 :            :                  * Not only is this an optimisation, but it is also required
    3271                 :            :                  * to check that the address is actually valid, when atomic
    3272                 :            :                  * usercopies are used, below.
    3273                 :            :                  */
    3274                 :          3 :                 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
    3275                 :            :                         status = -EFAULT;
    3276                 :          0 :                         break;
    3277                 :            :                 }
    3278                 :            : 
    3279                 :          3 :                 if (fatal_signal_pending(current)) {
    3280                 :            :                         status = -EINTR;
    3281                 :            :                         break;
    3282                 :            :                 }
    3283                 :            : 
    3284                 :          3 :                 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
    3285                 :            :                                                 &page, &fsdata);
    3286                 :          3 :                 if (unlikely(status < 0))
    3287                 :            :                         break;
    3288                 :            : 
    3289                 :          3 :                 if (mapping_writably_mapped(mapping))
    3290                 :          0 :                         flush_dcache_page(page);
    3291                 :            : 
    3292                 :          3 :                 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
    3293                 :          3 :                 flush_dcache_page(page);
    3294                 :            :                 /* write_end()'s non-negative return value replaces copied as the amount to advance by. */
    3295                 :          3 :                 status = a_ops->write_end(file, mapping, pos, bytes, copied,
    3296                 :            :                                                 page, fsdata);
    3297                 :          3 :                 if (unlikely(status < 0))
    3298                 :            :                         break;
    3299                 :          3 :                 copied = status;
    3300                 :            : 
    3301                 :          3 :                 cond_resched();
    3302                 :            : 
    3303                 :          3 :                 iov_iter_advance(i, copied);
    3304                 :          3 :                 if (unlikely(copied == 0)) {
    3305                 :            :                         /*
    3306                 :            :                          * If we were unable to copy any data at all, we must
    3307                 :            :                          * fall back to a single segment length write.
    3308                 :            :                          *
    3309                 :            :                          * If we didn't fallback here, we could livelock
    3310                 :            :                          * because not all segments in the iov can be copied at
    3311                 :            :                          * once without a pagefault.
    3312                 :            :                          */
    3313                 :          0 :                         bytes = min_t(unsigned long, PAGE_SIZE - offset,
    3314                 :            :                                                 iov_iter_single_seg_count(i));
    3315                 :          0 :                         goto again;
    3316                 :            :                 }
    3317                 :          3 :                 pos += copied;
    3318                 :          3 :                 written += copied;
    3319                 :            : 
    3320                 :          3 :                 balance_dirty_pages_ratelimited(mapping);
    3321                 :          3 :         } while (iov_iter_count(i));
    3322                 :            :         /* Report progress if any bytes were written; otherwise pass on the last status (0 or an error). */
    3323                 :          3 :         return written ? written : status;
    3324                 :            : }
    3325                 :            : EXPORT_SYMBOL(generic_perform_write);
    3326                 :            : 
    3327                 :            : /**
    3328                 :            :  * __generic_file_write_iter - write data to a file
    3329                 :            :  * @iocb:       IO state structure (file, offset, etc.)
    3330                 :            :  * @from:       iov_iter with data to write
    3331                 :            :  *
    3332                 :            :  * This function does all the work needed for actually writing data to a
    3333                 :            :  * file. It does all basic checks, removes SUID from the file, updates
    3334                 :            :  * modification times and calls proper subroutines depending on whether we
    3335                 :            :  * do direct IO or a standard buffered write.
    3336                 :            :  *
    3337                 :            :  * It expects i_mutex to be grabbed unless we work on a block device or similar
    3338                 :            :  * object which does not need locking at all.
    3339                 :            :  *
    3340                 :            :  * This function does *not* take care of syncing data in case of O_SYNC write.
    3341                 :            :  * A caller has to handle it. This is mainly due to the fact that we want to
    3342                 :            :  * avoid syncing under i_mutex.
    3343                 :            :  *
    3344                 :            :  * Return:
    3345                 :            :  * * number of bytes written, even for truncated writes
    3346                 :            :  * * negative error code if no data has been written at all
    3347                 :            :  */
    3348                 :          3 : ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    3349                 :            : {
    3350                 :          3 :         struct file *file = iocb->ki_filp;
    3351                 :          3 :         struct address_space * mapping = file->f_mapping;
    3352                 :          3 :         struct inode    *inode = mapping->host;
    3353                 :            :         ssize_t         written = 0;
    3354                 :            :         ssize_t         err;
    3355                 :            :         ssize_t         status;
    3356                 :            : 
    3357                 :            :         /* We can write back this queue in page reclaim */
    3358                 :          3 :         current->backing_dev_info = inode_to_bdi(inode);
    3359                 :          3 :         err = file_remove_privs(file);
    3360                 :          3 :         if (err)
    3361                 :            :                 goto out;
    3362                 :            : 
    3363                 :          3 :         err = file_update_time(file);
    3364                 :          3 :         if (err)
    3365                 :            :                 goto out;
    3366                 :            : 
    3367                 :          3 :         if (iocb->ki_flags & IOCB_DIRECT) {
    3368                 :            :                 loff_t pos, endbyte;
    3369                 :            : 
    3370                 :          0 :                 written = generic_file_direct_write(iocb, from);
    3371                 :            :                 /*
    3372                 :            :                  * If the write stopped short of completing, fall back to
    3373                 :            :                  * buffered writes.  Some filesystems do this for writes to
    3374                 :            :                  * holes, for example.  For DAX files, a buffered write will
    3375                 :            :                  * not succeed (even if it did, DAX does not handle dirty
    3376                 :            :                  * page-cache pages correctly).
    3377                 :            :                  */
    3378                 :          0 :                 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
    3379                 :            :                         goto out;
    3380                 :            : 
    3381                 :          0 :                 status = generic_perform_write(file, from, pos = iocb->ki_pos);
    3382                 :            :                 /*
    3383                 :            :                  * If generic_perform_write() returned a synchronous error
    3384                 :            :                  * then we want to return the number of bytes which were
    3385                 :            :                  * direct-written, or the error code if that was zero.  Note
    3386                 :            :                  * that this differs from normal direct-io semantics, which
    3387                 :            :                  * will return -EFOO even if some bytes were written.
    3388                 :            :                  */
    3389                 :          0 :                 if (unlikely(status < 0)) {
    3390                 :            :                         err = status;
    3391                 :            :                         goto out;
    3392                 :            :                 }
    3393                 :            :                 /*
    3394                 :            :                  * We need to ensure that the page cache pages are written to
    3395                 :            :                  * disk and invalidated to preserve the expected O_DIRECT
    3396                 :            :                  * semantics.
    3397                 :            :                  */
    3398                 :          0 :                 endbyte = pos + status - 1;
    3399                 :          0 :                 err = filemap_write_and_wait_range(mapping, pos, endbyte);
    3400                 :          0 :                 if (err == 0) {
    3401                 :          0 :                         iocb->ki_pos = endbyte + 1;
    3402                 :          0 :                         written += status;
    3403                 :          0 :                         invalidate_mapping_pages(mapping,
    3404                 :          0 :                                                  pos >> PAGE_SHIFT,
    3405                 :          0 :                                                  endbyte >> PAGE_SHIFT);
    3406                 :            :                 } else {
    3407                 :            :                         /*
    3408                 :            :                          * We don't know how much we wrote, so just return
    3409                 :            :                          * the number of bytes which were direct-written
    3410                 :            :                          */
    3411                 :            :                 }
    3412                 :            :         } else {
    3413                 :          3 :                 written = generic_perform_write(file, from, iocb->ki_pos);
    3414                 :          3 :                 if (likely(written > 0))
    3415                 :          3 :                         iocb->ki_pos += written;
    3416                 :            :         }
    3417                 :            : out:
    3418                 :          3 :         current->backing_dev_info = NULL;
    3419                 :          3 :         return written ? written : err;
    3420                 :            : }
    3421                 :            : EXPORT_SYMBOL(__generic_file_write_iter);
    3422                 :            : 
    3423                 :            : /**
    3424                 :            :  * generic_file_write_iter - write data to a file
    3425                 :            :  * @iocb:       IO state structure
    3426                 :            :  * @from:       iov_iter with data to write
    3427                 :            :  *
    3428                 :            :  * This is a wrapper around __generic_file_write_iter() to be used by most
    3429                 :            :  * filesystems. It takes care of syncing the file in case of O_SYNC file
    3430                 :            :  * and acquires i_mutex as needed.
    3431                 :            :  * Return:
    3432                 :            :  * * negative error code if no data has been written at all or
    3433                 :            :  *   vfs_fsync_range() failed for a synchronous write
    3434                 :            :  * * number of bytes written, even for truncated writes
    3435                 :            :  */
    3436                 :          3 : ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
    3437                 :            : {
    3438                 :          3 :         struct file *file = iocb->ki_filp;
    3439                 :          3 :         struct inode *inode = file->f_mapping->host;
    3440                 :            :         ssize_t ret;
    3441                 :            : 
    3442                 :            :         inode_lock(inode);
    3443                 :          3 :         ret = generic_write_checks(iocb, from);
    3444                 :          3 :         if (ret > 0)
    3445                 :          3 :                 ret = __generic_file_write_iter(iocb, from);
    3446                 :            :         inode_unlock(inode);
    3447                 :            : 
    3448                 :          3 :         if (ret > 0)
    3449                 :          3 :                 ret = generic_write_sync(iocb, ret);
    3450                 :          3 :         return ret;
    3451                 :            : }
    3452                 :            : EXPORT_SYMBOL(generic_file_write_iter);
    3453                 :            : 
    3454                 :            : /**
    3455                 :            :  * try_to_release_page() - release old fs-specific metadata on a page
    3456                 :            :  *
    3457                 :            :  * @page: the page which the kernel is trying to free
    3458                 :            :  * @gfp_mask: memory allocation flags (and I/O mode)
    3459                 :            :  *
    3460                 :            :  * The address_space is asked to release any data attached to the page
    3461                 :            :  * (presumably at page->private).
    3462                 :            :  *
    3463                 :            :  * This may also be called if PG_fscache is set on a page, indicating that the
    3464                 :            :  * page is known to the local caching routines.
    3465                 :            :  *
    3466                 :            :  * The @gfp_mask argument specifies whether I/O may be performed to release
    3467                 :            :  * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
    3468                 :            :  *
    3469                 :            :  * Return: %1 if the release was successful, otherwise return zero.
    3470                 :            :  */
    3471                 :          3 : int try_to_release_page(struct page *page, gfp_t gfp_mask)
    3472                 :            : {
    3473                 :          3 :         struct address_space * const mapping = page->mapping;
    3474                 :            : 
    3475                 :          3 :         BUG_ON(!PageLocked(page));
    3476                 :          3 :         if (PageWriteback(page))
    3477                 :            :                 return 0;
    3478                 :            : 
    3479                 :          3 :         if (mapping && mapping->a_ops->releasepage)
    3480                 :          3 :                 return mapping->a_ops->releasepage(page, gfp_mask);
    3481                 :          0 :         return try_to_free_buffers(page);
    3482                 :            : }
    3483                 :            : 
    3484                 :            : EXPORT_SYMBOL(try_to_release_page);

Generated by: LCOV version 1.14