LCOV - code coverage report
Current view: top level - kernel/sched - loadavg.c (source / functions) Hit Total Coverage
Test: combined.info Lines: 38 100 38.0 %
Date: 2022-04-01 14:58:12 Functions: 6 11 54.5 %
Branches: 8 46 17.4 %

           Branch data     Line data    Source code
       1                 :            : // SPDX-License-Identifier: GPL-2.0
       2                 :            : /*
       3                 :            :  * kernel/sched/loadavg.c
       4                 :            :  *
       5                 :            :  * This file contains the magic bits required to compute the global loadavg
       6                 :            :  * figure. Its a silly number but people think its important. We go through
       7                 :            :  * great pains to make it work on big machines and tickless kernels.
       8                 :            :  */
       9                 :            : #include "sched.h"
      10                 :            : 
      11                 :            : /*
      12                 :            :  * Global load-average calculations
      13                 :            :  *
      14                 :            :  * We take a distributed and async approach to calculating the global load-avg
      15                 :            :  * in order to minimize overhead.
      16                 :            :  *
      17                 :            :  * The global load average is an exponentially decaying average of nr_running +
      18                 :            :  * nr_uninterruptible.
      19                 :            :  *
      20                 :            :  * Once every LOAD_FREQ:
      21                 :            :  *
      22                 :            :  *   nr_active = 0;
      23                 :            :  *   for_each_possible_cpu(cpu)
      24                 :            :  *      nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
      25                 :            :  *
      26                 :            :  *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
      27                 :            :  *
      28                 :            :  * Due to a number of reasons the above turns in the mess below:
      29                 :            :  *
      30                 :            :  *  - for_each_possible_cpu() is prohibitively expensive on machines with
      31                 :            :  *    serious number of CPUs, therefore we need to take a distributed approach
      32                 :            :  *    to calculating nr_active.
      33                 :            :  *
      34                 :            :  *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
      35                 :            :  *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
      36                 :            :  *
      37                 :            :  *    So assuming nr_active := 0 when we start out -- true per definition, we
      38                 :            :  *    can simply take per-CPU deltas and fold those into a global accumulate
      39                 :            :  *    to obtain the same result. See calc_load_fold_active().
      40                 :            :  *
      41                 :            :  *    Furthermore, in order to avoid synchronizing all per-CPU delta folding
      42                 :            :  *    across the machine, we assume 10 ticks is sufficient time for every
      43                 :            :  *    CPU to have completed this task.
      44                 :            :  *
      45                 :            :  *    This places an upper-bound on the IRQ-off latency of the machine. Then
      46                 :            :  *    again, being late doesn't loose the delta, just wrecks the sample.
      47                 :            :  *
      48                 :            :  *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
      49                 :            :  *    this would add another cross-CPU cacheline miss and atomic operation
      50                 :            :  *    to the wakeup path. Instead we increment on whatever CPU the task ran
      51                 :            :  *    when it went into uninterruptible state and decrement on whatever CPU
      52                 :            :  *    did the wakeup. This means that only the sum of nr_uninterruptible over
      53                 :            :  *    all CPUs yields the correct result.
      54                 :            :  *
      55                 :            :  *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
      56                 :            :  */
      57                 :            : 
      58                 :            : /* Variables and functions for calc_load */
      59                 :            : atomic_long_t calc_load_tasks;
      60                 :            : unsigned long calc_load_update;
      61                 :            : unsigned long avenrun[3];
      62                 :            : EXPORT_SYMBOL(avenrun); /* should be removed */
      63                 :            : 
      64                 :            : /**
      65                 :            :  * get_avenrun - get the load average array
      66                 :            :  * @loads:      pointer to dest load array
      67                 :            :  * @offset:     offset to add
      68                 :            :  * @shift:      shift count to shift the result left
      69                 :            :  *
      70                 :            :  * These values are estimates at best, so no need for locking.
      71                 :            :  */
      72                 :         24 : void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
      73                 :            : {
      74                 :         24 :         loads[0] = (avenrun[0] + offset) << shift;
      75                 :         24 :         loads[1] = (avenrun[1] + offset) << shift;
      76                 :         24 :         loads[2] = (avenrun[2] + offset) << shift;
      77                 :         24 : }
      78                 :            : 
      79                 :         53 : long calc_load_fold_active(struct rq *this_rq, long adjust)
      80                 :            : {
      81                 :         53 :         long nr_active, delta = 0;
      82                 :            : 
      83                 :         53 :         nr_active = this_rq->nr_running - adjust;
      84                 :         53 :         nr_active += (long)this_rq->nr_uninterruptible;
      85                 :            : 
      86         [ #  # ]:          0 :         if (nr_active != this_rq->calc_load_active) {
      87                 :         16 :                 delta = nr_active - this_rq->calc_load_active;
      88                 :          0 :                 this_rq->calc_load_active = nr_active;
      89                 :            :         }
      90                 :            : 
      91                 :         16 :         return delta;
      92                 :            : }
      93                 :            : 
      94                 :            : /**
      95                 :            :  * fixed_power_int - compute: x^n, in O(log n) time
      96                 :            :  *
      97                 :            :  * @x:         base of the power
      98                 :            :  * @frac_bits: fractional bits of @x
      99                 :            :  * @n:         power to raise @x to.
     100                 :            :  *
     101                 :            :  * By exploiting the relation between the definition of the natural power
     102                 :            :  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
     103                 :            :  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
     104                 :            :  * (where: n_i \elem {0, 1}, the binary vector representing n),
     105                 :            :  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
     106                 :            :  * of course trivially computable in O(log_2 n), the length of our binary
     107                 :            :  * vector.
     108                 :            :  */
     109                 :            : static unsigned long
     110                 :          0 : fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
     111                 :            : {
     112                 :          0 :         unsigned long result = 1UL << frac_bits;
     113                 :            : 
     114                 :          0 :         if (n) {
     115                 :          0 :                 for (;;) {
     116         [ #  # ]:          0 :                         if (n & 1) {
     117                 :          0 :                                 result *= x;
     118                 :          0 :                                 result += 1UL << (frac_bits - 1);
     119                 :          0 :                                 result >>= frac_bits;
     120                 :            :                         }
     121                 :          0 :                         n >>= 1;
     122         [ #  # ]:          0 :                         if (!n)
     123                 :            :                                 break;
     124                 :          0 :                         x *= x;
     125                 :          0 :                         x += 1UL << (frac_bits - 1);
     126                 :          0 :                         x >>= frac_bits;
     127                 :            :                 }
     128                 :            :         }
     129                 :            : 
     130         [ #  # ]:          0 :         return result;
     131                 :            : }
     132                 :            : 
     133                 :            : /*
     134                 :            :  * a1 = a0 * e + a * (1 - e)
     135                 :            :  *
     136                 :            :  * a2 = a1 * e + a * (1 - e)
     137                 :            :  *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
     138                 :            :  *    = a0 * e^2 + a * (1 - e) * (1 + e)
     139                 :            :  *
     140                 :            :  * a3 = a2 * e + a * (1 - e)
     141                 :            :  *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
     142                 :            :  *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
     143                 :            :  *
     144                 :            :  *  ...
     145                 :            :  *
     146                 :            :  * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
     147                 :            :  *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
     148                 :            :  *    = a0 * e^n + a * (1 - e^n)
     149                 :            :  *
     150                 :            :  * [1] application of the geometric series:
     151                 :            :  *
     152                 :            :  *              n         1 - x^(n+1)
     153                 :            :  *     S_n := \Sum x^i = -------------
     154                 :            :  *             i=0          1 - x
     155                 :            :  */
     156                 :            : unsigned long
     157                 :          0 : calc_load_n(unsigned long load, unsigned long exp,
     158                 :            :             unsigned long active, unsigned int n)
     159                 :            : {
     160         [ #  # ]:          0 :         return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
     161                 :            : }
     162                 :            : 
     163                 :            : #ifdef CONFIG_NO_HZ_COMMON
     164                 :            : /*
     165                 :            :  * Handle NO_HZ for the global load-average.
     166                 :            :  *
     167                 :            :  * Since the above described distributed algorithm to compute the global
     168                 :            :  * load-average relies on per-CPU sampling from the tick, it is affected by
     169                 :            :  * NO_HZ.
     170                 :            :  *
     171                 :            :  * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
     172                 :            :  * entering NO_HZ state such that we can include this as an 'extra' CPU delta
     173                 :            :  * when we read the global state.
     174                 :            :  *
     175                 :            :  * Obviously reality has to ruin such a delightfully simple scheme:
     176                 :            :  *
     177                 :            :  *  - When we go NO_HZ idle during the window, we can negate our sample
     178                 :            :  *    contribution, causing under-accounting.
     179                 :            :  *
     180                 :            :  *    We avoid this by keeping two NO_HZ-delta counters and flipping them
     181                 :            :  *    when the window starts, thus separating old and new NO_HZ load.
     182                 :            :  *
     183                 :            :  *    The only trick is the slight shift in index flip for read vs write.
     184                 :            :  *
     185                 :            :  *        0s            5s            10s           15s
     186                 :            :  *          +10           +10           +10           +10
     187                 :            :  *        |-|-----------|-|-----------|-|-----------|-|
     188                 :            :  *    r:0 0 1           1 0           0 1           1 0
     189                 :            :  *    w:0 1 1           0 0           1 1           0 0
     190                 :            :  *
     191                 :            :  *    This ensures we'll fold the old NO_HZ contribution in this window while
     192                 :            :  *    accumlating the new one.
     193                 :            :  *
     194                 :            :  *  - When we wake up from NO_HZ during the window, we push up our
     195                 :            :  *    contribution, since we effectively move our sample point to a known
     196                 :            :  *    busy state.
     197                 :            :  *
     198                 :            :  *    This is solved by pushing the window forward, and thus skipping the
     199                 :            :  *    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
     200                 :            :  *    was in effect at the time the window opened). This also solves the issue
     201                 :            :  *    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
     202                 :            :  *    intervals.
     203                 :            :  *
     204                 :            :  * When making the ILB scale, we should try to pull this in as well.
     205                 :            :  */
     206                 :            : static atomic_long_t calc_load_nohz[2];
     207                 :            : static int calc_load_idx;
     208                 :            : 
     209                 :         16 : static inline int calc_load_write_idx(void)
     210                 :            : {
     211                 :         16 :         int idx = calc_load_idx;
     212                 :            : 
     213                 :            :         /*
     214                 :            :          * See calc_global_nohz(), if we observe the new index, we also
     215                 :            :          * need to observe the new update time.
     216                 :            :          */
     217                 :         16 :         smp_rmb();
     218                 :            : 
     219                 :            :         /*
     220                 :            :          * If the folding window started, make sure we start writing in the
     221                 :            :          * next NO_HZ-delta.
     222                 :            :          */
     223         [ -  + ]:         16 :         if (!time_before(jiffies, READ_ONCE(calc_load_update)))
     224                 :          0 :                 idx++;
     225                 :            : 
     226                 :         16 :         return idx & 1;
     227                 :            : }
     228                 :            : 
     229                 :          0 : static inline int calc_load_read_idx(void)
     230                 :            : {
     231                 :          0 :         return calc_load_idx & 1;
     232                 :            : }
     233                 :            : 
     234                 :         53 : static void calc_load_nohz_fold(struct rq *rq)
     235                 :            : {
     236                 :         53 :         long delta;
     237                 :            : 
     238         [ +  + ]:         53 :         delta = calc_load_fold_active(rq, 0);
     239                 :         16 :         if (delta) {
     240                 :         16 :                 int idx = calc_load_write_idx();
     241                 :            : 
     242                 :         16 :                 atomic_long_add(delta, &calc_load_nohz[idx]);
     243                 :            :         }
     244                 :         53 : }
     245                 :            : 
     246                 :         53 : void calc_load_nohz_start(void)
     247                 :            : {
     248                 :            :         /*
     249                 :            :          * We're going into NO_HZ mode, if there's any pending delta, fold it
     250                 :            :          * into the pending NO_HZ delta.
     251                 :            :          */
     252                 :         53 :         calc_load_nohz_fold(this_rq());
     253                 :         53 : }
     254                 :            : 
     255                 :            : /*
     256                 :            :  * Keep track of the load for NOHZ_FULL, must be called between
     257                 :            :  * calc_load_nohz_{start,stop}().
     258                 :            :  */
     259                 :          0 : void calc_load_nohz_remote(struct rq *rq)
     260                 :            : {
     261                 :          0 :         calc_load_nohz_fold(rq);
     262                 :          0 : }
     263                 :            : 
     264                 :         53 : void calc_load_nohz_stop(void)
     265                 :            : {
     266                 :         53 :         struct rq *this_rq = this_rq();
     267                 :            : 
     268                 :            :         /*
     269                 :            :          * If we're still before the pending sample window, we're done.
     270                 :            :          */
     271         [ -  + ]:         53 :         this_rq->calc_load_update = READ_ONCE(calc_load_update);
     272         [ -  + ]:         53 :         if (time_before(jiffies, this_rq->calc_load_update))
     273                 :            :                 return;
     274                 :            : 
     275                 :            :         /*
     276                 :            :          * We woke inside or after the sample window, this means we're already
     277                 :            :          * accounted through the nohz accounting, so skip the entire deal and
     278                 :            :          * sync up for the next window.
     279                 :            :          */
     280         [ #  # ]:          0 :         if (time_before(jiffies, this_rq->calc_load_update + 10))
     281                 :          0 :                 this_rq->calc_load_update += LOAD_FREQ;
     282                 :            : }
     283                 :            : 
     284                 :          0 : static long calc_load_nohz_read(void)
     285                 :            : {
     286                 :          0 :         int idx = calc_load_read_idx();
     287                 :          0 :         long delta = 0;
     288                 :            : 
     289         [ #  # ]:          0 :         if (atomic_long_read(&calc_load_nohz[idx]))
     290                 :          0 :                 delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
     291                 :            : 
     292                 :          0 :         return delta;
     293                 :            : }
     294                 :            : 
     295                 :            : /*
     296                 :            :  * NO_HZ can leave us missing all per-CPU ticks calling
     297                 :            :  * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
     298                 :            :  * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
     299                 :            :  * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
     300                 :            :  *
     301                 :            :  * Once we've updated the global active value, we need to apply the exponential
     302                 :            :  * weights adjusted to the number of cycles missed.
     303                 :            :  */
     304                 :          0 : static void calc_global_nohz(void)
     305                 :            : {
     306                 :          0 :         unsigned long sample_window;
     307                 :          0 :         long delta, active, n;
     308                 :            : 
     309         [ #  # ]:          0 :         sample_window = READ_ONCE(calc_load_update);
     310         [ #  # ]:          0 :         if (!time_before(jiffies, sample_window + 10)) {
     311                 :            :                 /*
     312                 :            :                  * Catch-up, fold however many we are behind still
     313                 :            :                  */
     314                 :          0 :                 delta = jiffies - sample_window - 10;
     315                 :          0 :                 n = 1 + (delta / LOAD_FREQ);
     316                 :            : 
     317                 :          0 :                 active = atomic_long_read(&calc_load_tasks);
     318         [ #  # ]:          0 :                 active = active > 0 ? active * FIXED_1 : 0;
     319                 :            : 
     320                 :          0 :                 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
     321                 :          0 :                 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
     322                 :          0 :                 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
     323                 :            : 
     324                 :          0 :                 WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
     325                 :            :         }
     326                 :            : 
     327                 :            :         /*
     328                 :            :          * Flip the NO_HZ index...
     329                 :            :          *
     330                 :            :          * Make sure we first write the new time then flip the index, so that
     331                 :            :          * calc_load_write_idx() will see the new time when it reads the new
     332                 :            :          * index, this avoids a double flip messing things up.
     333                 :            :          */
     334                 :          0 :         smp_wmb();
     335                 :          0 :         calc_load_idx++;
     336                 :          0 : }
     337                 :            : #else /* !CONFIG_NO_HZ_COMMON */
     338                 :            : 
     339                 :            : static inline long calc_load_nohz_read(void) { return 0; }
     340                 :            : static inline void calc_global_nohz(void) { }
     341                 :            : 
     342                 :            : #endif /* CONFIG_NO_HZ_COMMON */
     343                 :            : 
     344                 :            : /*
     345                 :            :  * calc_load - update the avenrun load estimates 10 ticks after the
     346                 :            :  * CPUs have updated calc_load_tasks.
     347                 :            :  *
     348                 :            :  * Called from the global timer code.
     349                 :            :  */
     350                 :       6820 : void calc_global_load(unsigned long ticks)
     351                 :            : {
     352                 :       6820 :         unsigned long sample_window;
     353                 :       6820 :         long active, delta;
     354                 :            : 
     355         [ -  + ]:       6820 :         sample_window = READ_ONCE(calc_load_update);
     356         [ -  + ]:       6820 :         if (time_before(jiffies, sample_window + 10))
     357                 :            :                 return;
     358                 :            : 
     359                 :            :         /*
     360                 :            :          * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
     361                 :            :          */
     362                 :          0 :         delta = calc_load_nohz_read();
     363         [ #  # ]:          0 :         if (delta)
     364                 :          0 :                 atomic_long_add(delta, &calc_load_tasks);
     365                 :            : 
     366                 :          0 :         active = atomic_long_read(&calc_load_tasks);
     367         [ #  # ]:          0 :         active = active > 0 ? active * FIXED_1 : 0;
     368                 :            : 
     369         [ #  # ]:          0 :         avenrun[0] = calc_load(avenrun[0], EXP_1, active);
     370         [ #  # ]:          0 :         avenrun[1] = calc_load(avenrun[1], EXP_5, active);
     371         [ #  # ]:          0 :         avenrun[2] = calc_load(avenrun[2], EXP_15, active);
     372                 :            : 
     373                 :          0 :         WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
     374                 :            : 
     375                 :            :         /*
     376                 :            :          * In case we went to NO_HZ for multiple LOAD_FREQ intervals
     377                 :            :          * catch up in bulk.
     378                 :            :          */
     379                 :          0 :         calc_global_nohz();
     380                 :            : }
     381                 :            : 
     382                 :            : /*
     383                 :            :  * Called from scheduler_tick() to periodically update this CPU's
     384                 :            :  * active count.
     385                 :            :  */
     386                 :       6809 : void calc_global_load_tick(struct rq *this_rq)
     387                 :            : {
     388                 :       6809 :         long delta;
     389                 :            : 
     390         [ -  + ]:       6809 :         if (time_before(jiffies, this_rq->calc_load_update))
     391                 :            :                 return;
     392                 :            : 
     393         [ #  # ]:          0 :         delta  = calc_load_fold_active(this_rq, 0);
     394                 :          0 :         if (delta)
     395                 :          0 :                 atomic_long_add(delta, &calc_load_tasks);
     396                 :            : 
     397                 :          0 :         this_rq->calc_load_update += LOAD_FREQ;
     398                 :            : }

Generated by: LCOV version 1.14