LCOV - Real - mm/oom

LCOV - code coverage report

Current view:	top level - mm - oom_kill.c (source / functions)		Hit	Total	Coverage
Test:	Real	Lines:	14	285	4.9 %
Date:	2020-10-17 15:46:43	Functions:	1	30	3.3 %
Legend:	Neither, QEMU, Real, Both	Branches:	0	0	-

           Branch data     Line data    Source code

       1                 :            : // SPDX-License-Identifier: GPL-2.0-only
       2                 :            : /*
       3                 :            :  *  linux/mm/oom_kill.c
       4                 :            :  * 
       5                 :            :  *  Copyright (C)  1998,2000  Rik van Riel
       6                 :            :  *      Thanks go out to Claus Fischer for some serious inspiration and
       7                 :            :  *      for goading me into coding this file...
       8                 :            :  *  Copyright (C)  2010  Google, Inc.
       9                 :            :  *      Rewritten by David Rientjes
      10                 :            :  *
      11                 :            :  *  The routines in this file are used to kill a process when
      12                 :            :  *  we're seriously out of memory. This gets called from __alloc_pages()
      13                 :            :  *  in mm/page_alloc.c when we really run out of memory.
      14                 :            :  *
      15                 :            :  *  Since we won't call these routines often (on a well-configured
      16                 :            :  *  machine) this file will double as a 'coding guide' and a signpost
      17                 :            :  *  for newbie kernel hackers. It features several pointers to major
      18                 :            :  *  kernel subsystems and hints as to where to find out what things do.
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <linux/oom.h>
      22                 :            : #include <linux/mm.h>
      23                 :            : #include <linux/err.h>
      24                 :            : #include <linux/gfp.h>
      25                 :            : #include <linux/sched.h>
      26                 :            : #include <linux/sched/mm.h>
      27                 :            : #include <linux/sched/coredump.h>
      28                 :            : #include <linux/sched/task.h>
      29                 :            : #include <linux/swap.h>
      30                 :            : #include <linux/timex.h>
      31                 :            : #include <linux/jiffies.h>
      32                 :            : #include <linux/cpuset.h>
      33                 :            : #include <linux/export.h>
      34                 :            : #include <linux/notifier.h>
      35                 :            : #include <linux/memcontrol.h>
      36                 :            : #include <linux/mempolicy.h>
      37                 :            : #include <linux/security.h>
      38                 :            : #include <linux/ptrace.h>
      39                 :            : #include <linux/freezer.h>
      40                 :            : #include <linux/ftrace.h>
      41                 :            : #include <linux/ratelimit.h>
      42                 :            : #include <linux/kthread.h>
      43                 :            : #include <linux/init.h>
      44                 :            : #include <linux/mmu_notifier.h>
      45                 :            : 
      46                 :            : #include <asm/tlb.h>
      47                 :            : #include "internal.h"
      48                 :            : #include "slab.h"
      49                 :            : 
      50                 :            : #define CREATE_TRACE_POINTS
      51                 :            : #include <trace/events/oom.h>
      52                 :            : 
      53                 :            : int sysctl_panic_on_oom;
      54                 :            : int sysctl_oom_kill_allocating_task;
      55                 :            : int sysctl_oom_dump_tasks = 1;
      56                 :            : 
      57                 :            : /*
      58                 :            :  * Serializes oom killer invocations (out_of_memory()) from all contexts to
      59                 :            :  * prevent over-eager oom killing (e.g. when the oom killer is invoked
      60                 :            :  * from different domains).
      61                 :            :  *
      62                 :            :  * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
      63                 :            :  * and mark_oom_victim
      64                 :            :  */
      65                 :            : DEFINE_MUTEX(oom_lock);
      66                 :            : 
      67                 :            : static inline bool is_memcg_oom(struct oom_control *oc)
      68                 :            : {
      69                 :          0 :         return oc->memcg != NULL;
      70                 :            : }
      71                 :            : 
      72                 :            : #ifdef CONFIG_NUMA
      73                 :            : /**
      74                 :            :  * oom_cpuset_eligible() - check task eligibility for kill
      75                 :            :  * @start: task struct of which task to consider
      76                 :            :  * @oc: pointer to struct oom_control
      77                 :            :  *
      78                 :            :  * Task eligibility is determined by whether or not a candidate task, @start,
      79                 :            :  * shares the same mempolicy nodes as current if it is bound by such a policy
      80                 :            :  * and whether or not it has the same set of allowed cpuset nodes.
      81                 :            :  *
      82                 :            :  * This function is assuming oom-killer context and 'current' has triggered
      83                 :            :  * the oom-killer.
      84                 :            :  */
      85                 :            : static bool oom_cpuset_eligible(struct task_struct *start,
      86                 :            :                                 struct oom_control *oc)
      87                 :            : {
      88                 :            :         struct task_struct *tsk;
      89                 :            :         bool ret = false;
      90                 :            :         const nodemask_t *mask = oc->nodemask;
      91                 :            : 
      92                 :            :         if (is_memcg_oom(oc))
      93                 :            :                 return true;
      94                 :            : 
      95                 :            :         rcu_read_lock();
      96                 :            :         for_each_thread(start, tsk) {
      97                 :            :                 if (mask) {
      98                 :            :                         /*
      99                 :            :                          * If this is a mempolicy constrained oom, tsk's
     100                 :            :                          * cpuset is irrelevant.  Only return true if its
     101                 :            :                          * mempolicy intersects current, otherwise it may be
     102                 :            :                          * needlessly killed.
     103                 :            :                          */
     104                 :            :                         ret = mempolicy_nodemask_intersects(tsk, mask);
     105                 :            :                 } else {
     106                 :            :                         /*
     107                 :            :                          * This is not a mempolicy constrained oom, so only
     108                 :            :                          * check the mems of tsk's cpuset.
     109                 :            :                          */
     110                 :            :                         ret = cpuset_mems_allowed_intersects(current, tsk);
     111                 :            :                 }
     112                 :            :                 if (ret)
     113                 :            :                         break;
     114                 :            :         }
     115                 :            :         rcu_read_unlock();
     116                 :            : 
     117                 :            :         return ret;
     118                 :            : }
     119                 :            : #else
     120                 :            : static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
     121                 :            : {
     122                 :            :         return true;
     123                 :            : }
     124                 :            : #endif /* CONFIG_NUMA */
     125                 :            : 
     126                 :            : /*
     127                 :            :  * The process p may have detached its own ->mm while exiting or through
     128                 :            :  * use_mm(), but one or more of its subthreads may still have a valid
     129                 :            :  * pointer.  Return p, or any of its subthreads with a valid ->mm, with
     130                 :            :  * task_lock() held.
     131                 :            :  */
     132                 :          3 : struct task_struct *find_lock_task_mm(struct task_struct *p)
     133                 :            : {
     134                 :            :         struct task_struct *t;
     135                 :            : 
     136                 :            :         rcu_read_lock();
     137                 :            : 
     138                 :          3 :         for_each_thread(p, t) {
     139                 :            :                 task_lock(t);
     140                 :          3 :                 if (likely(t->mm))
     141                 :            :                         goto found;
     142                 :            :                 task_unlock(t);
     143                 :            :         }
     144                 :            :         t = NULL;
     145                 :            : found:
     146                 :            :         rcu_read_unlock();
     147                 :            : 
     148                 :          3 :         return t;
     149                 :            : }
     150                 :            : 
     151                 :            : /*
     152                 :            :  * order == -1 means the oom kill is required by sysrq; otherwise order is
     153                 :            :  * used only for display purposes.
     154                 :            :  */
     155                 :            : static inline bool is_sysrq_oom(struct oom_control *oc)
     156                 :            : {
     157                 :          0 :         return oc->order == -1;
     158                 :            : }
     159                 :            : 
     160                 :            : /* return true if the task is not adequate as candidate victim task. */
     161                 :            : static bool oom_unkillable_task(struct task_struct *p)
     162                 :            : {
     163                 :          0 :         if (is_global_init(p))
     164                 :            :                 return true;
     165                 :          0 :         if (p->flags & PF_KTHREAD)
     166                 :            :                 return true;
     167                 :            :         return false;
     168                 :            : }
     169                 :            : 
     170                 :            : /*
     171                 :            :  * Print out unreclaimable slabs info when unreclaimable slabs amount is greater
     172                 :            :  * than all user memory (LRU pages)
     173                 :            :  */
     174                 :          0 : static bool is_dump_unreclaim_slabs(void)
     175                 :            : {
     176                 :            :         unsigned long nr_lru;
     177                 :            : 
     178                 :          0 :         nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
     179                 :          0 :                  global_node_page_state(NR_INACTIVE_ANON) +
     180                 :          0 :                  global_node_page_state(NR_ACTIVE_FILE) +
     181                 :          0 :                  global_node_page_state(NR_INACTIVE_FILE) +
     182                 :          0 :                  global_node_page_state(NR_ISOLATED_ANON) +
     183                 :            :                  global_node_page_state(NR_ISOLATED_FILE) +
     184                 :            :                  global_node_page_state(NR_UNEVICTABLE);
     185                 :            : 
     186                 :          0 :         return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
     187                 :            : }
     188                 :            : 
     189                 :            : /**
     190                 :            :  * oom_badness - heuristic function to determine which candidate task to kill
     191                 :            :  * @p: task struct of which task we should calculate
     192                 :            :  * @totalpages: total present RAM allowed for page allocation
     193                 :            :  *
     194                 :            :  * The heuristic for determining which task to kill is made to be as simple and
     195                 :            :  * predictable as possible.  The goal is to return the highest value for the
     196                 :            :  * task consuming the most memory to avoid subsequent oom failures.
     197                 :            :  */
     198                 :          0 : unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
     199                 :            : {
     200                 :            :         long points;
     201                 :            :         long adj;
     202                 :            : 
     203                 :          0 :         if (oom_unkillable_task(p))
     204                 :            :                 return 0;
     205                 :            : 
     206                 :          0 :         p = find_lock_task_mm(p);
     207                 :          0 :         if (!p)
     208                 :            :                 return 0;
     209                 :            : 
     210                 :            :         /*
     211                 :            :          * Do not even consider tasks which are explicitly marked oom
     212                 :            :          * unkillable or have been already oom reaped or they are in
     213                 :            :          * the middle of vfork
     214                 :            :          */
     215                 :          0 :         adj = (long)p->signal->oom_score_adj;
     216                 :          0 :         if (adj == OOM_SCORE_ADJ_MIN ||
     217                 :          0 :                         test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
     218                 :            :                         in_vfork(p)) {
     219                 :            :                 task_unlock(p);
     220                 :          0 :                 return 0;
     221                 :            :         }
     222                 :            : 
     223                 :            :         /*
     224                 :            :          * The baseline for the badness score is the proportion of RAM that each
     225                 :            :          * task's rss, pagetable and swap space use.
     226                 :            :          */
     227                 :          0 :         points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
     228                 :          0 :                 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
     229                 :            :         task_unlock(p);
     230                 :            : 
     231                 :            :         /* Normalize to oom_score_adj units */
     232                 :          0 :         adj *= totalpages / 1000;
     233                 :          0 :         points += adj;
     234                 :            : 
     235                 :            :         /*
     236                 :            :          * Never return 0 for an eligible task regardless of the root bonus and
     237                 :            :          * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
     238                 :            :          */
     239                 :          0 :         return points > 0 ? points : 1;
     240                 :            : }
     241                 :            : 
     242                 :            : static const char * const oom_constraint_text[] = {
     243                 :            :         [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
     244                 :            :         [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
     245                 :            :         [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
     246                 :            :         [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
     247                 :            : };
     248                 :            : 
     249                 :            : /*
     250                 :            :  * Determine the type of allocation constraint.
     251                 :            :  */
     252                 :          0 : static enum oom_constraint constrained_alloc(struct oom_control *oc)
     253                 :            : {
     254                 :            :         struct zone *zone;
     255                 :            :         struct zoneref *z;
     256                 :            :         enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
     257                 :            :         bool cpuset_limited = false;
     258                 :            :         int nid;
     259                 :            : 
     260                 :          0 :         if (is_memcg_oom(oc)) {
     261                 :          0 :                 oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
     262                 :          0 :                 return CONSTRAINT_MEMCG;
     263                 :            :         }
     264                 :            : 
     265                 :            :         /* Default to all available memory */
     266                 :          0 :         oc->totalpages = totalram_pages() + total_swap_pages;
     267                 :            : 
     268                 :            :         if (!IS_ENABLED(CONFIG_NUMA))
     269                 :          0 :                 return CONSTRAINT_NONE;
     270                 :            : 
     271                 :            :         if (!oc->zonelist)
     272                 :            :                 return CONSTRAINT_NONE;
     273                 :            :         /*
     274                 :            :          * Reach here only when __GFP_NOFAIL is used. So, we should avoid
     275                 :            :          * killing current. We have to kill a random task in this case.
     276                 :            :          * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
     277                 :            :          */
     278                 :            :         if (oc->gfp_mask & __GFP_THISNODE)
     279                 :            :                 return CONSTRAINT_NONE;
     280                 :            : 
     281                 :            :         /*
     282                 :            :          * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
     283                 :            :          * the page allocator means a mempolicy is in effect.  Cpuset policy
     284                 :            :          * is enforced in get_page_from_freelist().
     285                 :            :          */
     286                 :            :         if (oc->nodemask &&
     287                 :            :             !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
     288                 :            :                 oc->totalpages = total_swap_pages;
     289                 :            :                 for_each_node_mask(nid, *oc->nodemask)
     290                 :            :                         oc->totalpages += node_present_pages(nid);
     291                 :            :                 return CONSTRAINT_MEMORY_POLICY;
     292                 :            :         }
     293                 :            : 
     294                 :            :         /* Check this allocation failure is caused by cpuset's wall function */
     295                 :            :         for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
     296                 :            :                         high_zoneidx, oc->nodemask)
     297                 :            :                 if (!cpuset_zone_allowed(zone, oc->gfp_mask))
     298                 :            :                         cpuset_limited = true;
     299                 :            : 
     300                 :            :         if (cpuset_limited) {
     301                 :            :                 oc->totalpages = total_swap_pages;
     302                 :            :                 for_each_node_mask(nid, cpuset_current_mems_allowed)
     303                 :            :                         oc->totalpages += node_present_pages(nid);
     304                 :            :                 return CONSTRAINT_CPUSET;
     305                 :            :         }
     306                 :            :         return CONSTRAINT_NONE;
     307                 :            : }
     308                 :            : 
     309                 :          0 : static int oom_evaluate_task(struct task_struct *task, void *arg)
     310                 :            : {
     311                 :            :         struct oom_control *oc = arg;
     312                 :            :         unsigned long points;
     313                 :            : 
     314                 :          0 :         if (oom_unkillable_task(task))
     315                 :            :                 goto next;
     316                 :            : 
     317                 :            :         /* task may not have freeable memory in nodemask */
     318                 :            :         if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
     319                 :            :                 goto next;
     320                 :            : 
     321                 :            :         /*
     322                 :            :          * This task already has access to memory reserves and is being killed.
     323                 :            :          * Don't allow any other task to have access to the reserves unless
     324                 :            :          * the task has MMF_OOM_SKIP because the chances that it would release
     325                 :            :          * any memory are quite low.
     326                 :            :          */
     327                 :          0 :         if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
     328                 :          0 :                 if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
     329                 :            :                         goto next;
     330                 :            :                 goto abort;
     331                 :            :         }
     332                 :            : 
     333                 :            :         /*
     334                 :            :          * If task is allocating a lot of memory and has been marked to be
     335                 :            :          * killed first if it triggers an oom, then select it.
     336                 :            :          */
     337                 :          0 :         if (oom_task_origin(task)) {
     338                 :            :                 points = ULONG_MAX;
     339                 :            :                 goto select;
     340                 :            :         }
     341                 :            : 
     342                 :          0 :         points = oom_badness(task, oc->totalpages);
     343                 :          0 :         if (!points || points < oc->chosen_points)
     344                 :            :                 goto next;
     345                 :            : 
     346                 :            : select:
     347                 :          0 :         if (oc->chosen)
     348                 :          0 :                 put_task_struct(oc->chosen);
     349                 :            :         get_task_struct(task);
     350                 :          0 :         oc->chosen = task;
     351                 :          0 :         oc->chosen_points = points;
     352                 :            : next:
     353                 :            :         return 0;
     354                 :            : abort:
     355                 :          0 :         if (oc->chosen)
     356                 :          0 :                 put_task_struct(oc->chosen);
     357                 :          0 :         oc->chosen = (void *)-1UL;
     358                 :          0 :         return 1;
     359                 :            : }
     360                 :            : 
     361                 :            : /*
     362                 :            :  * Simple selection loop. We choose the process with the highest number of
     363                 :            :  * 'points'. In case scan was aborted, oc->chosen is set to -1.
     364                 :            :  */
     365                 :          0 : static void select_bad_process(struct oom_control *oc)
     366                 :            : {
     367                 :          0 :         if (is_memcg_oom(oc))
     368                 :          0 :                 mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
     369                 :            :         else {
     370                 :            :                 struct task_struct *p;
     371                 :            : 
     372                 :            :                 rcu_read_lock();
     373                 :          0 :                 for_each_process(p)
     374                 :          0 :                         if (oom_evaluate_task(p, oc))
     375                 :            :                                 break;
     376                 :            :                 rcu_read_unlock();
     377                 :            :         }
     378                 :          0 : }
     379                 :            : 
     380                 :          0 : static int dump_task(struct task_struct *p, void *arg)
     381                 :            : {
     382                 :            :         struct oom_control *oc = arg;
     383                 :            :         struct task_struct *task;
     384                 :            : 
     385                 :          0 :         if (oom_unkillable_task(p))
     386                 :            :                 return 0;
     387                 :            : 
     388                 :            :         /* p may not have freeable memory in nodemask */
     389                 :            :         if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
     390                 :            :                 return 0;
     391                 :            : 
     392                 :          0 :         task = find_lock_task_mm(p);
     393                 :          0 :         if (!task) {
     394                 :            :                 /*
     395                 :            :                  * This is a kthread or all of p's threads have already
     396                 :            :                  * detached their mm's.  There's no need to report
     397                 :            :                  * them; they can't be oom killed anyway.
     398                 :            :                  */
     399                 :            :                 return 0;
     400                 :            :         }
     401                 :            : 
     402                 :          0 :         pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
     403                 :            :                 task->pid, from_kuid(&init_user_ns, task_uid(task)),
     404                 :            :                 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
     405                 :            :                 mm_pgtables_bytes(task->mm),
     406                 :            :                 get_mm_counter(task->mm, MM_SWAPENTS),
     407                 :            :                 task->signal->oom_score_adj, task->comm);
     408                 :            :         task_unlock(task);
     409                 :            : 
     410                 :          0 :         return 0;
     411                 :            : }
     412                 :            : 
     413                 :            : /**
     414                 :            :  * dump_tasks - dump current memory state of all system tasks
     415                 :            :  * @oc: pointer to struct oom_control
     416                 :            :  *
     417                 :            :  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
     418                 :            :  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
     419                 :            :  * are not shown.
     420                 :            :  * State information includes task's pid, uid, tgid, vm size, rss,
     421                 :            :  * pgtables_bytes, swapents, oom_score_adj value, and name.
     422                 :            :  */
     423                 :          0 : static void dump_tasks(struct oom_control *oc)
     424                 :            : {
     425                 :          0 :         pr_info("Tasks state (memory values in pages):\n");
     426                 :          0 :         pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
     427                 :            : 
     428                 :          0 :         if (is_memcg_oom(oc))
     429                 :          0 :                 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
     430                 :            :         else {
     431                 :            :                 struct task_struct *p;
     432                 :            : 
     433                 :            :                 rcu_read_lock();
     434                 :          0 :                 for_each_process(p)
     435                 :          0 :                         dump_task(p, oc);
     436                 :            :                 rcu_read_unlock();
     437                 :            :         }
     438                 :          0 : }
     439                 :            : 
     440                 :          0 : static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
     441                 :            : {
     442                 :            :         /*
                         :            :          * One line summary of the oom killer context.  The pr_info()
                         :            :          * format has no trailing newline; the closing pr_cont() supplies
                         :            :          * it, so the two helper calls in between presumably append to
                         :            :          * the same line (TODO confirm against their definitions).
                         :            :          */
     443                 :          0 :         pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
     444                 :            :                         oom_constraint_text[oc->constraint],
     445                 :            :                         nodemask_pr_args(oc->nodemask));
     446                 :          0 :         cpuset_print_current_mems_allowed();
     447                 :          0 :         mem_cgroup_print_oom_context(oc->memcg, victim);
     448                 :          0 :         pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
     449                 :            :                 from_kuid(&init_user_ns, task_uid(victim)));
     450                 :          0 : }
     451                 :            : 
     452                 :          0 : static void dump_header(struct oom_control *oc, struct task_struct *p)
     453                 :            : {       /*
                         :            :          * Log the context of an OOM event: the invoking task (current)
                         :            :          * with the request's gfp_mask and order, a stack dump, a memory
                         :            :          * report, the task table when sysctl_oom_dump_tasks is set, and
                         :            :          * the one-line victim summary when @p is non-NULL.
                         :            :          */
     454                 :          0 :         pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
     455                 :            :                 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
     456                 :            :                         current->signal->oom_score_adj);
     457                 :            :         if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)       /* order != 0 without CONFIG_COMPACTION */
     458                 :            :                 pr_warn("COMPACTION is disabled!!!\n");
     459                 :            : 
     460                 :          0 :         dump_stack();
     461                 :          0 :         if (is_memcg_oom(oc))   /* memcg OOM: print the memcg's own memory report */
     462                 :          0 :                 mem_cgroup_print_oom_meminfo(oc->memcg);
     463                 :            :         else {
     464                 :          0 :                 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
     465                 :          0 :                 if (is_dump_unreclaim_slabs())
     466                 :          0 :                         dump_unreclaimable_slab();
     467                 :            :         }
     468                 :          0 :         if (sysctl_oom_dump_tasks)
     469                 :          0 :                 dump_tasks(oc);
     470                 :          0 :         if (p)                  /* callers may pass NULL when there is no victim to summarise */
     471                 :          0 :                 dump_oom_summary(oc, p);
     472                 :          0 : }
     473                 :            : 
     474                 :            : /*
     475                 :            :  * Number of OOM victims in flight
                         :            :  * (incremented in mark_oom_victim(), decremented in exit_oom_victim()).
     476                 :            :  */
     477                 :            : static atomic_t oom_victims = ATOMIC_INIT(0);
     478                 :            : static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);      /* woken when oom_victims drops to 0 */
     479                 :            : 
     480                 :            : static bool oom_killer_disabled __read_mostly; /* set by oom_killer_disable(), cleared by oom_killer_enable() */
     481                 :            : 
     482                 :            : #define K(x) ((x) << (PAGE_SHIFT-10))  /* scales page counts to the "kB" figures logged in this file */
     483                 :            : 
     484                 :            : /*
     485                 :            :  * task->mm can be NULL if the task is the exited group leader.  So to
     486                 :            :  * determine whether the task is using a particular mm, we examine all the
     487                 :            :  * task's threads: if one of those is using this mm then this task was also
     488                 :            :  * using it.
                         :            :  *
                         :            :  * The verdict comes from the first thread found with a non-NULL mm; if no
                         :            :  * thread has one, the result is false.
     489                 :            :  */
     490                 :          2 : bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
     491                 :            : {
     492                 :            :         struct task_struct *t;
     493                 :            : 
     494                 :          2 :         for_each_thread(p, t) {
     495                 :          2 :                 struct mm_struct *t_mm = READ_ONCE(t->mm);     /* read once; t->mm may change under us */
     496                 :          2 :                 if (t_mm)
     497                 :          2 :                         return t_mm == mm;
     498                 :            :         }
     499                 :            :         return false;
     500                 :            : }
     501                 :            : 
     502                 :            : #ifdef CONFIG_MMU
     503                 :            : /*
     504                 :            :  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
     505                 :            :  * victim (if that is possible) to help the OOM killer to move on.
     506                 :            :  */
     507                 :            : static struct task_struct *oom_reaper_th;      /* result of kthread_run() in oom_init() */
     508                 :            : static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);       /* reaper sleeps here until the list is non-empty */
     509                 :            : static struct task_struct *oom_reaper_list;    /* head of queued tasks, chained via ->oom_reaper_list */
     510                 :            : static DEFINE_SPINLOCK(oom_reaper_lock);       /* held while pushing to / popping from oom_reaper_list */
     511                 :            : 
     512                 :          0 : bool __oom_reap_task_mm(struct mm_struct *mm)
     513                 :            : {       /*
                         :            :          * Walk every vma on mm->mmap and unmap the whole range of each one
                         :            :          * that passes can_madv_lru_vma() and is anonymous or not VM_SHARED.
                         :            :          * Returns false if the mmu notifier start call returned non-zero
                         :            :          * for at least one vma (that vma is left alone), true otherwise.
                         :            :          */
     514                 :            :         struct vm_area_struct *vma;
     515                 :            :         bool ret = true;
     516                 :            : 
     517                 :            :         /*
     518                 :            :          * Tell all users of get_user/copy_from_user etc... that the content
     519                 :            :          * is no longer stable. No barriers really needed because unmapping
     520                 :            :          * should imply barriers already and the reader would hit a page fault
     521                 :            :          * if it stumbled over a reaped memory.
     522                 :            :          */
     523                 :          0 :         set_bit(MMF_UNSTABLE, &mm->flags);
     524                 :            : 
     525                 :          0 :         for (vma = mm->mmap ; vma; vma = vma->vm_next) {
     526                 :          0 :                 if (!can_madv_lru_vma(vma))
     527                 :          0 :                         continue;
     528                 :            : 
     529                 :            :                 /*
     530                 :            :                  * Only anonymous pages have a good chance to be dropped
     531                 :            :                  * without additional steps which we cannot afford as we
     532                 :            :                  * are OOM already.
     533                 :            :                  *
     534                 :            :                  * We do not even care about fs backed pages because all
     535                 :            :                  * which are reclaimable have already been reclaimed and
     536                 :            :                  * we do not want to block exit_mmap by keeping mm ref
     537                 :            :                  * count elevated without a good reason.
     538                 :            :                  */
     539                 :          0 :                 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
     540                 :            :                         struct mmu_notifier_range range;
     541                 :            :                         struct mmu_gather tlb;
     542                 :            : 
     543                 :          0 :                         mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
     544                 :            :                                                 vma, mm, vma->vm_start,
     545                 :            :                                                 vma->vm_end);
     546                 :          0 :                         tlb_gather_mmu(&tlb, mm, range.start, range.end);
     547                 :            :                         if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
     548                 :            :                                 tlb_finish_mmu(&tlb, range.start, range.end);  /* pair with tlb_gather_mmu() above */
     549                 :            :                                 ret = false;    /* remembered so the caller can retry later */
     550                 :            :                                 continue;
     551                 :            :                         }
     552                 :          0 :                         unmap_page_range(&tlb, vma, range.start, range.end, NULL);
     553                 :            :                         mmu_notifier_invalidate_range_end(&range);
     554                 :          0 :                         tlb_finish_mmu(&tlb, range.start, range.end);
     555                 :            :                 }
     556                 :            :         }
     557                 :            : 
     558                 :          0 :         return ret;
     559                 :            : }
     560                 :            : 
     561                 :            : /*
     562                 :            :  * Reaps the address space of the given task.
     563                 :            :  *
     564                 :            :  * Returns true on success and false if none or part of the address space
     565                 :            :  * has been reclaimed and the caller should retry later.
                         :            :  *
                         :            :  * Failing to take mmap_sem for reading also returns false.  An mm that
                         :            :  * already has MMF_OOM_SKIP set is left untouched and reported as true.
     566                 :            :  */
     567                 :          0 : static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
     568                 :            : {
     569                 :            :         bool ret = true;
     570                 :            : 
     571                 :          0 :         if (!down_read_trylock(&mm->mmap_sem)) {
     572                 :          0 :                 trace_skip_task_reaping(tsk->pid);
     573                 :          0 :                 return false;
     574                 :            :         }
     575                 :            : 
     576                 :            :         /*
     577                 :            :          * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
     578                 :            :          * work on the mm anymore. The check for MMF_OOM_SKIP must run
     579                 :            :          * under mmap_sem for reading because it serializes against the
     580                 :            :          * down_write();up_write() cycle in exit_mmap().
     581                 :            :          */
     582                 :          0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
     583                 :          0 :                 trace_skip_task_reaping(tsk->pid);
     584                 :          0 :                 goto out_unlock;        /* ret is still true here */
     585                 :            :         }
     586                 :            : 
     587                 :          0 :         trace_start_task_reaping(tsk->pid);
     588                 :            : 
     589                 :            :         /* failed to reap part of the address space. Try again later */
     590                 :          0 :         ret = __oom_reap_task_mm(mm);
     591                 :          0 :         if (!ret)
     592                 :            :                 goto out_finish;        /* skip the "reaped process" message */
     593                 :            : 
     594                 :          0 :         pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
     595                 :            :                         task_pid_nr(tsk), tsk->comm,
     596                 :            :                         K(get_mm_counter(mm, MM_ANONPAGES)),
     597                 :            :                         K(get_mm_counter(mm, MM_FILEPAGES)),
     598                 :            :                         K(get_mm_counter(mm, MM_SHMEMPAGES)));
     599                 :            : out_finish:
     600                 :          0 :         trace_finish_task_reaping(tsk->pid);
     601                 :            : out_unlock:
     602                 :          0 :         up_read(&mm->mmap_sem);
     603                 :            : 
     604                 :          0 :         return ret;
     605                 :            : }
     606                 :            : 
     607                 :            : #define MAX_OOM_REAP_RETRIES 10        /* upper bound on oom_reap_task_mm() attempts per task */
     608                 :          0 : static void oom_reap_task(struct task_struct *tsk)
     609                 :            : {
     610                 :            :         int attempts = 0;
     611                 :          0 :         struct mm_struct *mm = tsk->signal->oom_mm;
     612                 :            : 
     613                 :            :         /*
                         :            :          * Retry the down_read_trylock(mmap_sem) a few times, sleeping
                         :            :          * HZ/10 between attempts.  attempts only exceeds
                         :            :          * MAX_OOM_REAP_RETRIES when every try returned false.
                         :            :          */
     614                 :          0 :         while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
     615                 :          0 :                 schedule_timeout_idle(HZ/10);
     616                 :            : 
     617                 :          0 :         if (attempts <= MAX_OOM_REAP_RETRIES ||        /* some attempt returned true */
     618                 :            :             test_bit(MMF_OOM_SKIP, &mm->flags))
     619                 :            :                 goto done;
     620                 :            : 
     621                 :          0 :         pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
     622                 :            :                 task_pid_nr(tsk), tsk->comm);
     623                 :            :         debug_show_all_locks();
     624                 :            : 
     625                 :            : done:
     626                 :          0 :         tsk->oom_reaper_list = NULL;
     627                 :            : 
     628                 :            :         /*
     629                 :            :          * Hide this mm from OOM killer because it has been either reaped or
     630                 :            :          * somebody can't call up_write(mmap_sem).
     631                 :            :          */
     632                 :          0 :         set_bit(MMF_OOM_SKIP, &mm->flags);
     633                 :            : 
     634                 :            :         /* Drop a reference taken by wake_oom_reaper */
     635                 :          0 :         put_task_struct(tsk);
     636                 :          0 : }
     637                 :            : 
     638                 :          3 : static int oom_reaper(void *unused)
     639                 :            : {       /*
                         :            :          * Thread function passed to kthread_run() in oom_init().  Loops
                         :            :          * forever: sleep until oom_reaper_list is non-empty, pop its head
                         :            :          * under oom_reaper_lock, then reap that task outside the lock.
                         :            :          */
     640                 :            :         while (true) {
     641                 :            :                 struct task_struct *tsk = NULL;
     642                 :            : 
     643                 :          3 :                 wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
     644                 :            :                 spin_lock(&oom_reaper_lock);
     645                 :          0 :                 if (oom_reaper_list != NULL) { /* re-checked now that the lock is held */
     646                 :            :                         tsk = oom_reaper_list;
     647                 :          0 :                         oom_reaper_list = tsk->oom_reaper_list;
     648                 :            :                 }
     649                 :            :                 spin_unlock(&oom_reaper_lock);
     650                 :            : 
     651                 :          0 :                 if (tsk)
     652                 :          0 :                         oom_reap_task(tsk);
     653                 :            :         }
     654                 :            : 
     655                 :            :         return 0;       /* not reached: the loop above has no exit */
     656                 :            : }
     657                 :            : 
     658                 :          0 : static void wake_oom_reaper(struct task_struct *tsk)
     659                 :            : {       /* Queue @tsk at the head of oom_reaper_list and wake the reaper thread. */
     660                 :            :         /* mm is already queued? */
     661                 :          0 :         if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
     662                 :          0 :                 return;
     663                 :            : 
     664                 :            :         get_task_struct(tsk);   /* reference dropped again at the end of oom_reap_task() */
     665                 :            : 
     666                 :            :         spin_lock(&oom_reaper_lock);
     667                 :          0 :         tsk->oom_reaper_list = oom_reaper_list;
     668                 :          0 :         oom_reaper_list = tsk;
     669                 :            :         spin_unlock(&oom_reaper_lock);
     670                 :          0 :         trace_wake_reaper(tsk->pid);
     671                 :          0 :         wake_up(&oom_reaper_wait);
     672                 :            : }
     673                 :            : 
     674                 :          3 : static int __init oom_init(void)
     675                 :            : {       /* Start the "oom_reaper" thread; the kthread_run() result is stored unchecked. */
     676                 :          3 :         oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
     677                 :          3 :         return 0;       /* always reports success */
     678                 :            : }
     679                 :            : subsys_initcall(oom_init)
     680                 :            : #else
     681                 :            : static inline void wake_oom_reaper(struct task_struct *tsk)
     682                 :            : {       /* !CONFIG_MMU: the reaper thread is not built, so there is nothing to queue or wake. */
     683                 :            : }
     684                 :            : #endif /* CONFIG_MMU */
     685                 :            : 
     686                 :            : /**
     687                 :            :  * mark_oom_victim - mark the given task as OOM victim
     688                 :            :  * @tsk: task to mark
     689                 :            :  *
     690                 :            :  * Has to be called with oom_lock held and never after
     691                 :            :  * oom has been disabled already.
     692                 :            :  *
     693                 :            :  * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
     694                 :            :  * under task_lock or operate on the current).
                         :            :  *
                         :            :  * A task that already has TIF_MEMDIE set is left as is, so oom_victims is
                         :            :  * incremented at most once per marking of a thread.
     695                 :            :  */
     696                 :          0 : static void mark_oom_victim(struct task_struct *tsk)
     697                 :            : {
     698                 :          0 :         struct mm_struct *mm = tsk->mm;
     699                 :            : 
     700                 :          0 :         WARN_ON(oom_killer_disabled);
     701                 :            :         /* OOM killer might race with memcg OOM */
     702                 :          0 :         if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
     703                 :          0 :                 return;
     704                 :            : 
     705                 :            :         /*
                         :            :          * oom_mm is bound to the signal struct life time.  Only the
                         :            :          * caller whose cmpxchg() finds it NULL installs mm, takes the
                         :            :          * mmgrab() reference and sets MMF_OOM_VICTIM.
                         :            :          */
     706                 :          0 :         if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
     707                 :          0 :                 mmgrab(tsk->signal->oom_mm);
     708                 :          0 :                 set_bit(MMF_OOM_VICTIM, &mm->flags);
     709                 :            :         }
     710                 :            : 
     711                 :            :         /*
     712                 :            :          * Make sure that the task is woken up from uninterruptible sleep
     713                 :            :          * if it is frozen because OOM killer wouldn't be able to free
     714                 :            :          * any memory and livelock. freezing_slow_path will tell the freezer
     715                 :            :          * that TIF_MEMDIE tasks should be ignored.
     716                 :            :          */
     717                 :          0 :         __thaw_task(tsk);
     718                 :            :         atomic_inc(&oom_victims);       /* balanced by atomic_dec_return() in exit_oom_victim() */
     719                 :          0 :         trace_mark_victim(tsk->pid);
     720                 :            : }
     721                 :            : 
     722                 :            : /**
     723                 :            :  * exit_oom_victim - note the exit of an OOM victim
                         :            :  *
                         :            :  * Clears TIF_MEMDIE (no task argument is passed, so presumably on the
                         :            :  * calling thread - TODO confirm), decrements oom_victims and wakes every
                         :            :  * waiter on oom_victims_wait once the count reaches zero.
     724                 :            :  */
     725                 :          0 : void exit_oom_victim(void)
     726                 :            : {
     727                 :            :         clear_thread_flag(TIF_MEMDIE);
     728                 :            : 
     729                 :          0 :         if (!atomic_dec_return(&oom_victims))  /* last victim in flight is gone */
     730                 :          0 :                 wake_up_all(&oom_victims_wait);
     731                 :          0 : }
     732                 :            : 
     733                 :            : /**
     734                 :            :  * oom_killer_enable - enable OOM killer
                         :            :  *
                         :            :  * Clears oom_killer_disabled and logs the change.  oom_lock is not taken
                         :            :  * here.
     735                 :            :  */
     736                 :          0 : void oom_killer_enable(void)
     737                 :            : {
     738                 :          0 :         oom_killer_disabled = false;
     739                 :          0 :         pr_info("OOM killer enabled.\n");
     740                 :          0 : }
     741                 :            : 
     742                 :            : /**
     743                 :            :  * oom_killer_disable - disable OOM killer
     744                 :            :  * @timeout: maximum timeout to wait for oom victims in jiffies
     745                 :            :  *
     746                 :            :  * Forces all page allocations to fail rather than trigger OOM killer.
     747                 :            :  * Will block and wait until all OOM victims are killed or the given
     748                 :            :  * timeout expires.
     749                 :            :  *
     750                 :            :  * The function cannot be called when there are runnable user tasks because
     751                 :            :  * the userspace would see unexpected allocation failures as a result. Any
     752                 :            :  * new usage of this function should be consulted with MM people.
     753                 :            :  *
     754                 :            :  * Returns true if successful and false if the OOM killer cannot be
     755                 :            :  * disabled.
                         :            :  *
                         :            :  * On the false paths the killer is left enabled: either the flag was never
                         :            :  * set (oom_lock could not be taken) or oom_killer_enable() undoes it.
     756                 :            :  */
     757                 :          0 : bool oom_killer_disable(signed long timeout)
     758                 :            : {
     759                 :            :         signed long ret;
     760                 :            : 
     761                 :            :         /*
     762                 :            :          * Make sure to not race with an ongoing OOM killer. Check that the
     763                 :            :          * current is not killed (possibly due to sharing the victim's memory).
     764                 :            :          */
     765                 :          0 :         if (mutex_lock_killable(&oom_lock))
     766                 :            :                 return false;
     767                 :          0 :         oom_killer_disabled = true;
     768                 :          0 :         mutex_unlock(&oom_lock);
     769                 :            : 
     770                 :          0 :         ret = wait_event_interruptible_timeout(oom_victims_wait,
     771                 :            :                         !atomic_read(&oom_victims), timeout);
     772                 :          0 :         if (ret <= 0) {         /* the wait did not end with oom_victims == 0: roll back */
     773                 :            :                 oom_killer_enable();
     774                 :          0 :                 return false;
     775                 :            :         }
     776                 :          0 :         pr_info("OOM killer disabled.\n");
     777                 :            : 
     778                 :          0 :         return true;
     779                 :            : }
     780                 :            : 
     781                 :            : static inline bool __task_will_free_mem(struct task_struct *task)
     782                 :            : {       /* Per-task check used by task_will_free_mem(); looks only at signal and task flags. */
     783                 :          0 :         struct signal_struct *sig = task->signal;
     784                 :            : 
     785                 :            :         /*
     786                 :            :          * A coredumping process may sleep for an extended period in exit_mm(),
     787                 :            :          * so the oom killer cannot assume that the process will promptly exit
     788                 :            :          * and release memory.
     789                 :            :          */
     790                 :          0 :         if (sig->flags & SIGNAL_GROUP_COREDUMP)
     791                 :            :                 return false;
     792                 :            : 
     793                 :          0 :         if (sig->flags & SIGNAL_GROUP_EXIT)    /* checked after the coredump test above, which takes precedence */
     794                 :            :                 return true;
     795                 :            : 
     796                 :          0 :         if (thread_group_empty(task) && (task->flags & PF_EXITING))    /* PF_EXITING task with an empty thread group */
     797                 :            :                 return true;
     798                 :            : 
     799                 :            :         return false;
     800                 :            : }
     801                 :            : 
     802                 :            : /*
     803                 :            :  * Checks whether the given task is dying or exiting and likely to
     804                 :            :  * release its address space. This means that all threads and processes
     805                 :            :  * sharing the same mm have to be killed or exiting.
     806                 :            :  * Caller has to make sure that task->mm is stable (hold task_lock or
     807                 :            :  * it operates on the current).
                         :            :  *
                         :            :  * Returns false for a task without mm, for one failing
                         :            :  * __task_will_free_mem(), and for an mm with MMF_OOM_SKIP set.
     808                 :            :  */
     809                 :          0 : static bool task_will_free_mem(struct task_struct *task)
     810                 :            : {
     811                 :          0 :         struct mm_struct *mm = task->mm;
     812                 :            :         struct task_struct *p;
     813                 :            :         bool ret = true;
     814                 :            : 
     815                 :            :         /*
     816                 :            :          * Skip tasks without mm because it might have passed its exit_mm and
     817                 :            :          * exit_oom_victim. oom_reaper could have rescued that but do not rely
     818                 :            :          * on that for now. We can consider find_lock_task_mm in future.
     819                 :            :          */
     820                 :          0 :         if (!mm)
     821                 :            :                 return false;
     822                 :            : 
     823                 :          0 :         if (!__task_will_free_mem(task))
     824                 :            :                 return false;
     825                 :            : 
     826                 :            :         /*
     827                 :            :          * This task has already been drained by the oom reaper so there are
     828                 :            :          * only small chances it will free some more
     829                 :            :          */
     830                 :          0 :         if (test_bit(MMF_OOM_SKIP, &mm->flags))
     831                 :            :                 return false;
     832                 :            : 
     833                 :          0 :         if (atomic_read(&mm->mm_users) <= 1)   /* mm_users is at most 1: skip the process scan below */
     834                 :            :                 return true;
     835                 :            : 
     836                 :            :         /*
     837                 :            :          * Make sure that all tasks which share the mm with the given tasks
     838                 :            :          * are dying as well to make sure that a) nobody pins its mm and
     839                 :            :          * b) the task is also reapable by the oom reaper.
     840                 :            :          */
     841                 :            :         rcu_read_lock();
     842                 :          0 :         for_each_process(p) {
     843                 :          0 :                 if (!process_shares_mm(p, mm))
     844                 :          0 :                         continue;
     845                 :          0 :                 if (same_thread_group(task, p))        /* task's own thread group is not re-checked */
     846                 :          0 :                         continue;
     847                 :            :                 ret = __task_will_free_mem(p);
     848                 :          0 :                 if (!ret)       /* one sharer that is not exiting settles the answer */
     849                 :            :                         break;
     850                 :            :         }
     851                 :            :         rcu_read_unlock();
     852                 :            : 
     853                 :          0 :         return ret;
     854                 :            : }
     855                 :            : /*
                         :            :  * __oom_kill_process - kill @victim and the other processes sharing its mm
                         :            :  * @victim: task to kill; the reference the caller holds on it is dropped
                         :            :  *          on every return path of this function
                         :            :  * @message: text printed at the start of the "Killed process" log line
                         :            :  *
                         :            :  * The task actually operated on is the one returned by
                         :            :  * find_lock_task_mm(@victim); if that is NULL nothing is killed.
                         :            :  * Otherwise that task is sent SIGKILL, marked as an OOM victim and
                         :            :  * logged, SIGKILL is sent to every other process (outside its thread
                         :            :  * group) that shares the same mm, and the OOM reaper is woken unless the
                         :            :  * global init process was found sharing the mm.
                         :            :  *
                         :            :  * NOTE(review): the task returned by find_lock_task_mm() is presumably
                         :            :  * task_lock()ed and has a non-NULL ->mm, since it is dereferenced and
                         :            :  * task_unlock()ed below without a visible task_lock() - confirm against
                         :            :  * the helper.
                         :            :  */
     856                 :          0 : static void __oom_kill_process(struct task_struct *victim, const char *message)
     857                 :            : {
     858                 :            :         struct task_struct *p;
     859                 :            :         struct mm_struct *mm;
     860                 :            :         bool can_oom_reap = true;
     861                 :            : 
     862                 :          0 :         p = find_lock_task_mm(victim);
     863                 :          0 :         if (!p) {
     864                 :          0 :                 put_task_struct(victim);
     865                 :          0 :                 return;
     866                 :          0 :         } else if (victim != p) {
     867                 :            :                 get_task_struct(p); /* move our reference from victim to p */
     868                 :          0 :                 put_task_struct(victim);
     869                 :            :                 victim = p;
     870                 :            :         }
     871                 :            : 
     872                 :            :         /* Get a reference to safely compare mm after task_unlock(victim) */
     873                 :          0 :         mm = victim->mm;
     874                 :            :         mmgrab(mm);
     875                 :            : 
     876                 :            :         /* Raise event before sending signal: task reaper must see this */
     877                 :            :         count_vm_event(OOM_KILL);
     878                 :          0 :         memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
     879                 :            : 
     880                 :            :         /*
     881                 :            :          * We should send SIGKILL before granting access to memory reserves
     882                 :            :          * in order to prevent the OOM victim from depleting the memory
     883                 :            :          * reserves from the user space under its control.
     884                 :            :          */
     885                 :          0 :         do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
     886                 :          0 :         mark_oom_victim(victim);
     887                 :          0 :         pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
     888                 :            :                 message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
     889                 :            :                 K(get_mm_counter(mm, MM_ANONPAGES)),
     890                 :            :                 K(get_mm_counter(mm, MM_FILEPAGES)),
     891                 :            :                 K(get_mm_counter(mm, MM_SHMEMPAGES)),
     892                 :            :                 from_kuid(&init_user_ns, task_uid(victim)),
     893                 :            :                 mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
     894                 :            :         task_unlock(victim);
     895                 :            : 
     896                 :            :         /*
     897                 :            :          * Kill all user processes sharing victim->mm in other thread groups, if
     898                 :            :          * any.  They don't get access to memory reserves, though, to avoid
     899                 :            :          * depletion of all memory.  This prevents mm->mmap_sem livelock when an
     900                 :            :          * oom killed thread cannot exit because it requires the semaphore and
     901                 :            :          * it is contended by another thread trying to allocate memory itself.
     902                 :            :          * That thread will now get access to memory reserves since it has a
     903                 :            :          * pending fatal signal.
     904                 :            :          */
     905                 :            :         rcu_read_lock();
     906                 :          0 :         for_each_process(p) {
     907                 :          0 :                 if (!process_shares_mm(p, mm))
     908                 :          0 :                         continue;
     909                 :          0 :                 if (same_thread_group(p, victim))
     910                 :          0 :                         continue;
     911                 :          0 :                 if (is_global_init(p)) {
     912                 :            :                         can_oom_reap = false; /* init is never killed, so the mm stays in use */
     913                 :          0 :                         set_bit(MMF_OOM_SKIP, &mm->flags);
     914                 :          0 :                         pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
     915                 :            :                                         task_pid_nr(victim), victim->comm,
     916                 :            :                                         task_pid_nr(p), p->comm);
     917                 :          0 :                         continue;
     918                 :            :                 }
     919                 :            :                 /*
     920                 :            :                  * No use_mm() user needs to read from the userspace so we are
     921                 :            :                  * ok to reap it.
     922                 :            :                  */
     923                 :          0 :                 if (unlikely(p->flags & PF_KTHREAD))
     924                 :          0 :                         continue;
     925                 :          0 :                 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
     926                 :            :         }
     927                 :            :         rcu_read_unlock();
     928                 :            : 
     929                 :          0 :         if (can_oom_reap)
     930                 :          0 :                 wake_oom_reaper(victim);
     931                 :            : 
     932                 :          0 :         mmdrop(mm); /* pairs with mmgrab() above */
     933                 :          0 :         put_task_struct(victim); /* drop the task reference owned by this function */
     934                 :            : }
     935                 :            : #undef K
     936                 :            : 
     937                 :            : /*
     938                 :            :  * Kill provided task unless it's secured by setting
     939                 :            :  * oom_score_adj to OOM_SCORE_ADJ_MIN or it is the global init process.
                         :            :  *
                         :            :  * @task: candidate task; a reference is taken on it with
                         :            :  *        get_task_struct() before it is passed to __oom_kill_process()
                         :            :  * @message: opaque argument, forwarded unchanged to __oom_kill_process()
                         :            :  *           as its message string
                         :            :  *
                         :            :  * Passed as the per-task function to mem_cgroup_scan_tasks() by
                         :            :  * oom_kill_process().  Always returns 0.
                         :            :  * NOTE(review): returning 0 presumably tells mem_cgroup_scan_tasks() to
                         :            :  * keep iterating - confirm against that helper.
     940                 :            :  */
     941                 :          0 : static int oom_kill_memcg_member(struct task_struct *task, void *message)
     942                 :            : {
     943                 :          0 :         if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
     944                 :            :             !is_global_init(task)) {
     945                 :            :                 get_task_struct(task);
     946                 :          0 :                 __oom_kill_process(task, message);
     947                 :            :         }
     948                 :          0 :         return 0;
     949                 :            : }
     950                 :            : /*
                         :            :  * oom_kill_process - kill the task selected in @oc->chosen
                         :            :  * @oc: OOM context; ->chosen is the victim and ->memcg is used to look up
                         :            :  *      the memory cgroup group to kill, if any
                         :            :  * @message: text passed on to __oom_kill_process() for the kill log line
                         :            :  *
                         :            :  * The reference held on @oc->chosen is consumed: it is either dropped
                         :            :  * here (already-exiting victim) or handed to __oom_kill_process().
                         :            :  * Diagnostic output from dump_header() is rate limited by a function-local
                         :            :  * static ratelimit state.
                         :            :  */
     951                 :          0 : static void oom_kill_process(struct oom_control *oc, const char *message)
     952                 :            : {
     953                 :          0 :         struct task_struct *victim = oc->chosen;
     954                 :            :         struct mem_cgroup *oom_group;
     955                 :            :         static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
     956                 :            :                                               DEFAULT_RATELIMIT_BURST);
     957                 :            : 
     958                 :            :         /*
     959                 :            :          * If the task is already exiting, don't alarm the sysadmin or kill
     960                 :            :          * its children or threads, just give it access to memory reserves
     961                 :            :          * so it can die quickly
     962                 :            :          */
     963                 :            :         task_lock(victim);
     964                 :          0 :         if (task_will_free_mem(victim)) {
     965                 :          0 :                 mark_oom_victim(victim);
     966                 :          0 :                 wake_oom_reaper(victim);
     967                 :            :                 task_unlock(victim);
     968                 :          0 :                 put_task_struct(victim);
     969                 :          0 :                 return;
     970                 :            :         }
     971                 :            :         task_unlock(victim);
     972                 :            : 
     973                 :          0 :         if (__ratelimit(&oom_rs))
     974                 :          0 :                 dump_header(oc, victim);
     975                 :            : 
     976                 :            :         /*
     977                 :            :          * Do we need to kill the entire memory cgroup?
     978                 :            :          * Or even one of the ancestor memory cgroups?
     979                 :            :          * Check this out before killing the victim task.
     980                 :            :          */
     981                 :          0 :         oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
     982                 :            : 
     983                 :          0 :         __oom_kill_process(victim, message);
     984                 :            : 
     985                 :            :         /*
     986                 :            :          * If necessary, kill all tasks in the selected memory cgroup.
     987                 :            :          */
     988                 :          0 :         if (oom_group) {
     989                 :          0 :                 mem_cgroup_print_oom_group(oom_group);
     990                 :          0 :                 mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
     991                 :            :                                       (void*)message); /* cast drops const to fit the void * argument */
     992                 :          0 :                 mem_cgroup_put(oom_group);
     993                 :            :         }
     994                 :            : }
     995                 :            : 
     996                 :            : /*
     997                 :            :  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
                         :            :  *
                         :            :  * Returns without panicking when sysctl_panic_on_oom is 0, when it is any
                         :            :  * value other than 2 and @oc->constraint is not CONSTRAINT_NONE, or when
                         :            :  * the OOM was triggered by sysrq.  In every other case dump_header() is
                         :            :  * called and the kernel panics.
     998                 :            :  */
     999                 :          0 : static void check_panic_on_oom(struct oom_control *oc)
    1000                 :            : {
    1001                 :          0 :         if (likely(!sysctl_panic_on_oom))
    1002                 :            :                 return;
    1003                 :          0 :         if (sysctl_panic_on_oom != 2) {
    1004                 :            :                 /*
    1005                 :            :                  * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
    1006                 :            :                  * does not panic for cpuset, mempolicy, or memcg allocation
    1007                 :            :                  * failures.
    1008                 :            :                  */
    1009                 :          0 :                 if (oc->constraint != CONSTRAINT_NONE)
    1010                 :            :                         return;
    1011                 :            :         }
    1012                 :            :         /* Do not panic for oom kills triggered by sysrq */
    1013                 :          0 :         if (is_sysrq_oom(oc))
    1014                 :            :                 return;
    1015                 :          0 :         dump_header(oc, NULL);
    1016                 :          0 :         panic("Out of memory: %s panic_on_oom is enabled\n",
    1017                 :          0 :                 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
    1018                 :            : }
    1019                 :            : 
    1020                 :            : static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
    1021                 :            : /*
                         :            :  * register_oom_notifier - add @nb to the oom_notify_list notifier chain
                         :            :  *
                         :            :  * The chain is invoked by out_of_memory() for non-memcg OOMs with a
                         :            :  * pointer to an unsigned long "freed" counter as its data argument; if the
                         :            :  * counter is non-zero afterwards, out_of_memory() returns without killing.
                         :            :  *
                         :            :  * Returns the result of blocking_notifier_chain_register().
                         :            :  */
    1022                 :          0 : int register_oom_notifier(struct notifier_block *nb)
    1023                 :            : {
    1024                 :          0 :         return blocking_notifier_chain_register(&oom_notify_list, nb);
    1025                 :            : }
    1026                 :            : EXPORT_SYMBOL_GPL(register_oom_notifier);
    1027                 :            : /*
                         :            :  * unregister_oom_notifier - remove @nb from the oom_notify_list chain
                         :            :  *
                         :            :  * Returns the result of blocking_notifier_chain_unregister().
                         :            :  */
    1028                 :          0 : int unregister_oom_notifier(struct notifier_block *nb)
    1029                 :            : {
    1030                 :          0 :         return blocking_notifier_chain_unregister(&oom_notify_list, nb);
    1031                 :            : }
    1032                 :            : EXPORT_SYMBOL_GPL(unregister_oom_notifier);
    1033                 :            : 
    1034                 :            : /**
    1035                 :            :  * out_of_memory - kill the "best" process when we run out of memory
    1036                 :            :  * @oc: pointer to struct oom_control
    1037                 :            :  *
    1038                 :            :  * If we run out of memory, we have the choice between either
    1039                 :            :  * killing a random task (bad), letting the system crash (worse)
    1040                 :            :  * OR try to be smart about which process to kill. Note that we
    1041                 :            :  * don't have to be perfect here, we just have to be good.
                         :            :  *
                         :            :  * Return: false if the OOM killer is disabled.  true if an OOM notifier
                         :            :  * reported freed memory, if current is already going to free its memory,
                         :            :  * if the allocation is a non-memcg one with a non-zero gfp_mask lacking
                         :            :  * __GFP_FS, or if current was killed because of
                         :            :  * sysctl_oom_kill_allocating_task.  Otherwise true if and only if
                         :            :  * @oc->chosen is non-NULL after select_bad_process().  That includes the
                         :            :  * value (void *)-1UL, for which no kill is performed (presumably a
                         :            :  * sentinel set by select_bad_process() - TODO confirm).
    1042                 :            :  */
    1043                 :          0 : bool out_of_memory(struct oom_control *oc)
    1044                 :            : {
    1045                 :          0 :         unsigned long freed = 0;
    1046                 :            : 
    1047                 :          0 :         if (oom_killer_disabled)
    1048                 :            :                 return false;
    1049                 :            : 
    1050                 :          0 :         if (!is_memcg_oom(oc)) {
    1051                 :          0 :                 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
    1052                 :          0 :                 if (freed > 0)
    1053                 :            :                         /* Got some memory back in the last second. */
    1054                 :            :                         return true;
    1055                 :            :         }
    1056                 :            : 
    1057                 :            :         /*
    1058                 :            :          * If current has a pending SIGKILL or is exiting, then automatically
    1059                 :            :          * select it.  The goal is to allow it to allocate so that it may
    1060                 :            :          * quickly exit and free its memory.
    1061                 :            :          */
    1062                 :          0 :         if (task_will_free_mem(current)) {
    1063                 :          0 :                 mark_oom_victim(current);
    1064                 :          0 :                 wake_oom_reaper(current);
    1065                 :          0 :                 return true;
    1066                 :            :         }
    1067                 :            : 
    1068                 :            :         /*
    1069                 :            :          * The OOM killer does not compensate for IO-less reclaim.
    1070                 :            :          * pagefault_out_of_memory lost its gfp context so we have to
    1071                 :            :          * make sure to exclude the 0 mask - all other users should have at least
    1072                 :            :          * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
    1073                 :            :          * invoke the OOM killer even if it is a GFP_NOFS allocation.
    1074                 :            :          */
    1075                 :          0 :         if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
    1076                 :            :                 return true;
    1077                 :            : 
    1078                 :            :         /*
    1079                 :            :          * Check if there were limitations on the allocation (only relevant for
    1080                 :            :          * NUMA and memcg) that may require different handling.
    1081                 :            :          */
    1082                 :          0 :         oc->constraint = constrained_alloc(oc);
    1083                 :          0 :         if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
    1084                 :          0 :                 oc->nodemask = NULL;
    1085                 :          0 :         check_panic_on_oom(oc);
    1086                 :            : 
    1087                 :          0 :         if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
    1088                 :          0 :             current->mm && !oom_unkillable_task(current) &&
    1089                 :          0 :             oom_cpuset_eligible(current, oc) &&
    1090                 :          0 :             current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
    1091                 :            :                 get_task_struct(current); /* reference is consumed by oom_kill_process() */
    1092                 :          0 :                 oc->chosen = current;
    1093                 :          0 :                 oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
    1094                 :          0 :                 return true;
    1095                 :            :         }
    1096                 :            : 
    1097                 :          0 :         select_bad_process(oc);
    1098                 :            :         /* Found nothing?!?! */
    1099                 :          0 :         if (!oc->chosen) {
    1100                 :          0 :                 dump_header(oc, NULL);
    1101                 :          0 :                 pr_warn("Out of memory and no killable processes...\n");
    1102                 :            :                 /*
    1103                 :            :                  * If we got here due to an actual allocation at the
    1104                 :            :                  * system level, we cannot survive this and will enter
    1105                 :            :                  * an endless loop in the allocator. Bail out now.
    1106                 :            :                  */
    1107                 :          0 :                 if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
    1108                 :          0 :                         panic("System is deadlocked on memory\n");
    1109                 :            :         }
    1110                 :          0 :         if (oc->chosen && oc->chosen != (void *)-1UL)
    1111                 :          0 :                 oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
    1112                 :            :                                  "Memory cgroup out of memory");
    1113                 :          0 :         return !!oc->chosen;
    1114                 :            : }
    1115                 :            : 
    1116                 :            : /*
    1117                 :            :  * The pagefault handler calls here because it is out of memory, so kill a
    1118                 :            :  * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
    1119                 :            :  * killing is already in progress so do nothing.
                         :            :  *
                         :            :  * Nothing is done either when mem_cgroup_oom_synchronize(true) returns
                         :            :  * non-zero.  Otherwise out_of_memory() is called under oom_lock with an
                         :            :  * oom_control that has no zonelist, nodemask or memcg and a gfp_mask and
                         :            :  * order of 0; its return value is ignored.
    1120                 :            :  */
    1121                 :          0 : void pagefault_out_of_memory(void)
    1122                 :            : {
    1123                 :          0 :         struct oom_control oc = {
    1124                 :            :                 .zonelist = NULL,
    1125                 :            :                 .nodemask = NULL,
    1126                 :            :                 .memcg = NULL,
    1127                 :            :                 .gfp_mask = 0,
    1128                 :            :                 .order = 0,
    1129                 :            :         };
    1130                 :            : 
    1131                 :          0 :         if (mem_cgroup_oom_synchronize(true))
    1132                 :          0 :                 return;
    1133                 :            : 
    1134                 :          0 :         if (!mutex_trylock(&oom_lock))
    1135                 :            :                 return;
    1136                 :          0 :         out_of_memory(&oc);
    1137                 :          0 :         mutex_unlock(&oom_lock);
    1138                 :            : }

Generated by: LCOV version 1.14