Branch data Line data Source code
1 : : /* SPDX-License-Identifier: GPL-2.0 */
2 : : #ifndef _LINUX_MMZONE_H
3 : : #define _LINUX_MMZONE_H
4 : :
5 : : #ifndef __ASSEMBLY__
6 : : #ifndef __GENERATING_BOUNDS_H
7 : :
8 : : #include <linux/spinlock.h>
9 : : #include <linux/list.h>
10 : : #include <linux/wait.h>
11 : : #include <linux/bitops.h>
12 : : #include <linux/cache.h>
13 : : #include <linux/threads.h>
14 : : #include <linux/numa.h>
15 : : #include <linux/init.h>
16 : : #include <linux/seqlock.h>
17 : : #include <linux/nodemask.h>
18 : : #include <linux/pageblock-flags.h>
19 : : #include <linux/page-flags-layout.h>
20 : : #include <linux/atomic.h>
21 : : #include <linux/mm_types.h>
22 : : #include <linux/page-flags.h>
23 : : #include <asm/page.h>
24 : :
/* Free memory management - zoned buddy allocator.  */

/*
 * MAX_ORDER defaults to 11 and can be overridden at configuration time
 * through CONFIG_FORCE_MAX_ZONEORDER.
 */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
/* Number of pages in a block of order MAX_ORDER - 1. */
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3
40 : :
/*
 * Migrate types of a pageblock.  The entries before MIGRATE_PCPTYPES are
 * the ones that have a per-cpu page list; MIGRATE_TYPES is the total
 * number of types for the current configuration.
 */
enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
	/*
	 * MIGRATE_CMA migration type is designed to mimic the way
	 * ZONE_MOVABLE works.  Only movable pages can be allocated
	 * from MIGRATE_CMA pageblocks and page allocator never
	 * implicitly change migration type of MIGRATE_CMA pageblock.
	 *
	 * The way to use it is to change migratetype of a range of
	 * pageblocks to MIGRATE_CMA which can be done by
	 * __free_pageblock_cma() function.  What is important though
	 * is that a range of pageblocks must be aligned to
	 * MAX_ORDER_NR_PAGES should biggest page be bigger than
	 * a single pageblock.
	 */
	MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	MIGRATE_ISOLATE,	/* can't allocate from here */
#endif
	MIGRATE_TYPES
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

/*
 * CMA predicates.  Without CONFIG_CMA both collapse to the constant false,
 * so the argument is not evaluated at all.
 */
#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#endif

/*
 * is_migrate_movable - test whether @mt is a movable migrate type.
 *
 * Returns true for MIGRATE_MOVABLE and, when CONFIG_CMA is enabled, for
 * MIGRATE_CMA as well; false for every other value.
 */
static inline bool is_migrate_movable(int mt)
{
	return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
}
84 : :
/*
 * Iterate over every (order, migratetype) pair: @order runs from 0 to
 * MAX_ORDER - 1 in the outer loop, @type from 0 to MIGRATE_TYPES - 1 in
 * the inner loop.  Both arguments must be assignable lvalues; they are
 * parenthesized so that expressions such as *ptr expand safely.
 */
#define for_each_migratetype_order(order, type) \
	for ((order) = 0; (order) < MAX_ORDER; (order)++) \
		for ((type) = 0; (type) < MIGRATE_TYPES; (type)++)
88 : :
extern int page_group_by_mobility_disabled;

/* Width, in bits, of the PB_migrate .. PB_migrate_end pageblock flag range. */
#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
/* Mask with the low NR_MIGRATETYPE_BITS bits set. */
#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)

/*
 * Fetch the migrate type bits of the pageblock containing @page through
 * get_pfnblock_flags_mask().  Note that @page is evaluated twice.
 */
#define get_pageblock_migratetype(page)					\
	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
			PB_migrate_end, MIGRATETYPE_MASK)
97 : :
98 : : struct free_area {
99 : : struct list_head free_list[MIGRATE_TYPES];
100 : : unsigned long nr_free;
101 : : };
102 : :
103 : : /* Used for pages not on another list */
104 : 1931318 : static inline void add_to_free_area(struct page *page, struct free_area *area,
105 : : int migratetype)
106 : : {
107 : 1931318 : list_add(&page->lru, &area->free_list[migratetype]);
108 : 1302272 : area->nr_free++;
109 : 629046 : }
110 : :
111 : : /* Used for pages not on another list */
112 : 202392 : static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
113 : : int migratetype)
114 : : {
115 : 202392 : list_add_tail(&page->lru, &area->free_list[migratetype]);
116 : 202392 : area->nr_free++;
117 : : }
118 : :
#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
/* Used to preserve page allocation order entropy */
void add_to_free_area_random(struct page *page, struct free_area *area,
		int migratetype);
#else
/* Without CONFIG_SHUFFLE_PAGE_ALLOCATOR this is plain add_to_free_area(). */
static inline void add_to_free_area_random(struct page *page,
		struct free_area *area, int migratetype)
{
	add_to_free_area(page, area, migratetype);
}
#endif
130 : :
131 : : /* Used for pages which are on another list */
132 : 701 : static inline void move_to_free_area(struct page *page, struct free_area *area,
133 : : int migratetype)
134 : : {
135 : 701 : list_move(&page->lru, &area->free_list[migratetype]);
136 : : }
137 : :
138 : 2960497 : static inline struct page *get_page_from_free_area(struct free_area *area,
139 : : int migratetype)
140 : : {
141 [ - - - - : 2960497 : return list_first_entry_or_null(&area->free_list[migratetype],
+ + - + +
- + + - +
+ - - - -
- ]
142 : : struct page, lru);
143 : : }
144 : :
145 : 2122909 : static inline void del_page_from_free_area(struct page *page,
146 : : struct free_area *area)
147 : : {
148 [ - - ]: 2122909 : list_del(&page->lru);
149 [ - - ]: 2122909 : __ClearPageBuddy(page);
150 : 2122909 : set_page_private(page, 0);
151 [ - - ]: 2122909 : area->nr_free--;
152 : : }
153 : :
154 : 390430 : static inline bool free_area_empty(struct free_area *area, int migratetype)
155 : : {
156 [ - - + + : 390430 : return list_empty(&area->free_list[migratetype]);
+ + ]
157 : : }
158 : :
159 : : struct pglist_data;
160 : :
/*
 * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
 * So add a wild amount of padding here to ensure that they fall into separate
 * cachelines.  There are very few zone structures in the machine, so space
 * consumption is not a concern here.
 *
 * On non-SMP builds ZONE_PADDING() expands to nothing.  On SMP it declares
 * a member named @name; the macro supplies the trailing semicolon itself,
 * so call sites do not add one.
 */
#if defined(CONFIG_SMP)
struct zone_padding {
	char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name)	struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif
175 : :
/*
 * NUMA allocation counters.  Without CONFIG_NUMA the enum is absent and
 * NR_VM_NUMA_STAT_ITEMS is defined as 0.
 */
#ifdef CONFIG_NUMA
enum numa_stat_item {
	NUMA_HIT,		/* allocated in intended node */
	NUMA_MISS,		/* allocated in non intended node */
	NUMA_FOREIGN,		/* was intended here, hit elsewhere */
	NUMA_INTERLEAVE_HIT,	/* interleaver preferred this zone */
	NUMA_LOCAL,		/* allocation from local node */
	NUMA_OTHER,		/* allocation from other node */
	NR_VM_NUMA_STAT_ITEMS
};
#else
#define NR_VM_NUMA_STAT_ITEMS 0
#endif
189 : :
190 : : enum zone_stat_item {
191 : : /* First 128 byte cacheline (assuming 64 bit words) */
192 : : NR_FREE_PAGES,
193 : : NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
194 : : NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
195 : : NR_ZONE_ACTIVE_ANON,
196 : : NR_ZONE_INACTIVE_FILE,
197 : : NR_ZONE_ACTIVE_FILE,
198 : : NR_ZONE_UNEVICTABLE,
199 : : NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
200 : : NR_MLOCK, /* mlock()ed pages found and moved off LRU */
201 : : NR_PAGETABLE, /* used for pagetables */
202 : : NR_KERNEL_STACK_KB, /* measured in KiB */
203 : : /* Second 128 byte cacheline */
204 : : NR_BOUNCE,
205 : : #if IS_ENABLED(CONFIG_ZSMALLOC)
206 : : NR_ZSPAGES, /* allocated in zsmalloc */
207 : : #endif
208 : : NR_FREE_CMA_PAGES,
209 : : NR_VM_ZONE_STAT_ITEMS };
210 : :
/* Per-node statistics counters; NR_VM_NODE_STAT_ITEMS is the item count. */
enum node_stat_item {
	NR_LRU_BASE,
	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
	NR_SLAB_RECLAIMABLE,
	NR_SLAB_UNRECLAIMABLE,
	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
	WORKINGSET_NODES,
	WORKINGSET_REFAULT,
	WORKINGSET_ACTIVATE,
	WORKINGSET_RESTORE,
	WORKINGSET_NODERECLAIM,
	NR_ANON_MAPPED,	/* Mapped anonymous pages */
	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
			   only modified from process context */
	NR_FILE_PAGES,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
	NR_SHMEM_THPS,
	NR_SHMEM_PMDMAPPED,
	NR_FILE_THPS,
	NR_FILE_PMDMAPPED,
	NR_ANON_THPS,
	NR_UNSTABLE_NFS,	/* NFS unstable pages */
	NR_VMSCAN_WRITE,
	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
	NR_DIRTIED,		/* page dirtyings since bootup */
	NR_WRITTEN,		/* page writings since bootup */
	NR_KERNEL_MISC_RECLAIMABLE,	/* reclaimable non-slab kernel pages */
	NR_VM_NODE_STAT_ITEMS
};
248 : :
/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS
};

/* Visit every list index, LRU_UNEVICTABLE included. */
#define for_each_lru(lru) for ((lru) = 0; (lru) < NR_LRU_LISTS; (lru)++)

/* Visit list indices 0 .. LRU_ACTIVE_FILE, i.e. all but LRU_UNEVICTABLE. */
#define for_each_evictable_lru(lru) for ((lru) = 0; (lru) <= LRU_ACTIVE_FILE; (lru)++)

/* Return true for LRU_INACTIVE_FILE and LRU_ACTIVE_FILE only. */
static inline bool is_file_lru(enum lru_list lru)
{
	return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
}

/* Return true for LRU_ACTIVE_ANON and LRU_ACTIVE_FILE only. */
static inline bool is_active_lru(enum lru_list lru)
{
	return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
284 : :
struct zone_reclaim_stat {
	/*
	 * The pageout code in vmscan.c keeps track of how many of the
	 * mem/swap backed and file backed pages are referenced.
	 * The higher the rotated/scanned ratio, the more valuable
	 * that cache is.
	 *
	 * The anon LRU stats live in [0], file LRU stats in [1]
	 */
	unsigned long		recent_rotated[2];
	unsigned long		recent_scanned[2];
};
297 : :
/* State flags for struct lruvec's flags word. */
enum lruvec_flags {
	LRUVEC_CONGESTED,		/* lruvec has many dirty pages
					 * backed by a congested BDI
					 */
};
303 : :
304 : : struct lruvec {
305 : : struct list_head lists[NR_LRU_LISTS];
306 : : struct zone_reclaim_stat reclaim_stat;
307 : : /* Evictions & activations on the inactive file list */
308 : : atomic_long_t inactive_age;
309 : : /* Refaults at the time of last reclaim cycle */
310 : : unsigned long refaults;
311 : : /* Various lruvec state flags (enum lruvec_flags) */
312 : : unsigned long flags;
313 : : #ifdef CONFIG_MEMCG
314 : : struct pglist_data *pgdat;
315 : : #endif
316 : : };
317 : :
318 : : /* Isolate unmapped pages */
319 : : #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
320 : : /* Isolate for asynchronous migration */
321 : : #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
322 : : /* Isolate unevictable pages */
323 : : #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8)
324 : :
325 : : /* LRU Isolation modes. */
326 : : typedef unsigned __bitwise isolate_mode_t;
327 : :
/* Indices into zone->_watermark[]; NR_WMARK is the array size. */
enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

/*
 * Effective watermarks: the stored _watermark[] entry plus the zone's
 * current watermark_boost.  @z is a pointer to the zone; it is evaluated
 * twice, and is parenthesized so that pointer expressions such as
 * "zones + 1" expand correctly.
 */
#define min_wmark_pages(z) ((z)->_watermark[WMARK_MIN] + (z)->watermark_boost)
#define low_wmark_pages(z) ((z)->_watermark[WMARK_LOW] + (z)->watermark_boost)
#define high_wmark_pages(z) ((z)->_watermark[WMARK_HIGH] + (z)->watermark_boost)
#define wmark_pages(z, i) ((z)->_watermark[(i)] + (z)->watermark_boost)
339 : :
340 : : struct per_cpu_pages {
341 : : int count; /* number of pages in the list */
342 : : int high; /* high watermark, emptying needed */
343 : : int batch; /* chunk size for buddy add/remove */
344 : :
345 : : /* Lists of pages, one per migrate type stored on the pcp-lists */
346 : : struct list_head lists[MIGRATE_PCPTYPES];
347 : : };
348 : :
349 : : struct per_cpu_pageset {
350 : : struct per_cpu_pages pcp;
351 : : #ifdef CONFIG_NUMA
352 : : s8 expire;
353 : : u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
354 : : #endif
355 : : #ifdef CONFIG_SMP
356 : : s8 stat_threshold;
357 : : s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
358 : : #endif
359 : : };
360 : :
361 : : struct per_cpu_nodestat {
362 : : s8 stat_threshold;
363 : : s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
364 : : };
365 : :
#endif /* !__GENERATING_BOUNDS_H */
367 : :
/*
 * Zone indices.  Which entries exist depends on the configuration;
 * __MAX_NR_ZONES is always last and therefore equals the number of zone
 * types that were compiled in.
 */
enum zone_type {
	/*
	 * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
	 * to DMA to all of the addressable memory (ZONE_NORMAL).
	 * On architectures where this area covers the whole 32 bit address
	 * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
	 * DMA addressing constraints. This distinction is important as a 32bit
	 * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
	 * platforms may need both zones as they support peripherals with
	 * different DMA addressing limitations.
	 *
	 * Some examples:
	 *
	 *  - i386 and x86_64 have a fixed 16M ZONE_DMA and ZONE_DMA32 for the
	 *    rest of the lower 4G.
	 *
	 *  - arm only uses ZONE_DMA, the size, up to 4G, may vary depending on
	 *    the specific device.
	 *
	 *  - arm64 has a fixed 1G ZONE_DMA and ZONE_DMA32 for the rest of the
	 *    lower 4G.
	 *
	 *  - powerpc only uses ZONE_DMA, the size, up to 2G, may vary
	 *    depending on the specific device.
	 *
	 *  - s390 uses ZONE_DMA fixed to the lower 2G.
	 *
	 *  - ia64 and riscv only use ZONE_DMA32.
	 *
	 *  - parisc uses neither.
	 */
#ifdef CONFIG_ZONE_DMA
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES

};
429 : :
430 : : #ifndef __GENERATING_BOUNDS_H
431 : :
432 : : struct zone {
433 : : /* Read-mostly fields */
434 : :
435 : : /* zone watermarks, access with *_wmark_pages(zone) macros */
436 : : unsigned long _watermark[NR_WMARK];
437 : : unsigned long watermark_boost;
438 : :
439 : : unsigned long nr_reserved_highatomic;
440 : :
441 : : /*
442 : : * We don't know if the memory that we're going to allocate will be
443 : : * freeable or/and it will be released eventually, so to avoid totally
444 : : * wasting several GB of ram we must reserve some of the lower zone
445 : : * memory (otherwise we risk to run OOM on the lower zones despite
446 : : * there being tons of freeable ram on the higher zones). This array is
447 : : * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
448 : : * changes.
449 : : */
450 : : long lowmem_reserve[MAX_NR_ZONES];
451 : :
452 : : #ifdef CONFIG_NUMA
453 : : int node;
454 : : #endif
455 : : struct pglist_data *zone_pgdat;
456 : : struct per_cpu_pageset __percpu *pageset;
457 : :
458 : : #ifndef CONFIG_SPARSEMEM
459 : : /*
460 : : * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
461 : : * In SPARSEMEM, this map is stored in struct mem_section
462 : : */
463 : : unsigned long *pageblock_flags;
464 : : #endif /* CONFIG_SPARSEMEM */
465 : :
466 : : /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
467 : : unsigned long zone_start_pfn;
468 : :
469 : : /*
470 : : * spanned_pages is the total pages spanned by the zone, including
471 : : * holes, which is calculated as:
472 : : * spanned_pages = zone_end_pfn - zone_start_pfn;
473 : : *
474 : : * present_pages is physical pages existing within the zone, which
475 : : * is calculated as:
476 : : * present_pages = spanned_pages - absent_pages(pages in holes);
477 : : *
478 : : * managed_pages is present pages managed by the buddy system, which
479 : : * is calculated as (reserved_pages includes pages allocated by the
480 : : * bootmem allocator):
481 : : * managed_pages = present_pages - reserved_pages;
482 : : *
483 : : * So present_pages may be used by memory hotplug or memory power
484 : : * management logic to figure out unmanaged pages by checking
485 : : * (present_pages - managed_pages). And managed_pages should be used
486 : : * by page allocator and vm scanner to calculate all kinds of watermarks
487 : : * and thresholds.
488 : : *
489 : : * Locking rules:
490 : : *
491 : : * zone_start_pfn and spanned_pages are protected by span_seqlock.
492 : : * It is a seqlock because it has to be read outside of zone->lock,
493 : : * and it is done in the main allocator path. But, it is written
494 : : * quite infrequently.
495 : : *
496 : : * The span_seq lock is declared along with zone->lock because it is
497 : : * frequently read in proximity to zone->lock. It's good to
498 : : * give them a chance of being in the same cacheline.
499 : : *
500 : : * Write access to present_pages at runtime should be protected by
501 : : * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
502 : : * present_pages should get_online_mems() to get a stable value.
503 : : */
504 : : atomic_long_t managed_pages;
505 : : unsigned long spanned_pages;
506 : : unsigned long present_pages;
507 : :
508 : : const char *name;
509 : :
510 : : #ifdef CONFIG_MEMORY_ISOLATION
511 : : /*
512 : : * Number of isolated pageblock. It is used to solve incorrect
513 : : * freepage counting problem due to racy retrieving migratetype
514 : : * of pageblock. Protected by zone->lock.
515 : : */
516 : : unsigned long nr_isolate_pageblock;
517 : : #endif
518 : :
519 : : #ifdef CONFIG_MEMORY_HOTPLUG
520 : : /* see spanned/present_pages for more description */
521 : : seqlock_t span_seqlock;
522 : : #endif
523 : :
524 : : int initialized;
525 : :
526 : : /* Write-intensive fields used from the page allocator */
527 : : ZONE_PADDING(_pad1_)
528 : :
529 : : /* free areas of different sizes */
530 : : struct free_area free_area[MAX_ORDER];
531 : :
532 : : /* zone flags, see below */
533 : : unsigned long flags;
534 : :
535 : : /* Primarily protects free_area */
536 : : spinlock_t lock;
537 : :
538 : : /* Write-intensive fields used by compaction and vmstats. */
539 : : ZONE_PADDING(_pad2_)
540 : :
541 : : /*
542 : : * When free pages are below this point, additional steps are taken
543 : : * when reading the number of free pages to avoid per-cpu counter
544 : : * drift allowing watermarks to be breached
545 : : */
546 : : unsigned long percpu_drift_mark;
547 : :
548 : : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
549 : : /* pfn where compaction free scanner should start */
550 : : unsigned long compact_cached_free_pfn;
551 : : /* pfn where async and sync compaction migration scanner should start */
552 : : unsigned long compact_cached_migrate_pfn[2];
553 : : unsigned long compact_init_migrate_pfn;
554 : : unsigned long compact_init_free_pfn;
555 : : #endif
556 : :
557 : : #ifdef CONFIG_COMPACTION
558 : : /*
559 : : * On compaction failure, 1<<compact_defer_shift compactions
560 : : * are skipped before trying again. The number attempted since
561 : : * last failure is tracked with compact_considered.
562 : : */
563 : : unsigned int compact_considered;
564 : : unsigned int compact_defer_shift;
565 : : int compact_order_failed;
566 : : #endif
567 : :
568 : : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
569 : : /* Set to true when the PG_migrate_skip bits should be cleared */
570 : : bool compact_blockskip_flush;
571 : : #endif
572 : :
573 : : bool contiguous;
574 : :
575 : : ZONE_PADDING(_pad3_)
576 : : /* Zone statistics */
577 : : atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
578 : : atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
579 : : } ____cacheline_internodealigned_in_smp;
580 : :
/* State flags for pglist_data's flags word. */
enum pgdat_flags {
	PGDAT_DIRTY,			/* reclaim scanning has recently found
					 * many dirty file pages at the tail
					 * of the LRU.
					 */
	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
					 * many pages under writeback
					 */
	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
};
591 : :
/* State flags for struct zone's flags word. */
enum zone_flags {
	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
					 * Cleared when kswapd is woken.
					 */
};
597 : :
598 : 1960 : static inline unsigned long zone_managed_pages(struct zone *zone)
599 : : {
600 : 1512 : return (unsigned long)atomic_long_read(&zone->managed_pages);
601 : : }
602 : :
603 : 28728 : static inline unsigned long zone_end_pfn(const struct zone *zone)
604 : : {
605 [ + + # # : 14392 : return zone->zone_start_pfn + zone->spanned_pages;
# # # # #
# ]
606 : : }
607 : :
608 : 0 : static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
609 : : {
610 [ # # # # : 0 : return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
# # # # #
# # # ]
611 : : }
612 : :
613 : : static inline bool zone_is_initialized(struct zone *zone)
614 : : {
615 : : return zone->initialized;
616 : : }
617 : :
618 : 0 : static inline bool zone_is_empty(struct zone *zone)
619 : : {
620 [ # # ]: 0 : return zone->spanned_pages == 0;
621 : : }
622 : :
623 : : /*
624 : : * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
625 : : * intersection with the given zone
626 : : */
627 : : static inline bool zone_intersects(struct zone *zone,
628 : : unsigned long start_pfn, unsigned long nr_pages)
629 : : {
630 : : if (zone_is_empty(zone))
631 : : return false;
632 : : if (start_pfn >= zone_end_pfn(zone) ||
633 : : start_pfn + nr_pages <= zone->zone_start_pfn)
634 : : return false;
635 : :
636 : : return true;
637 : : }
638 : :
/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
648 : :
/* Zonelist indices; MAX_ZONELISTS is the number of zonelists per node. */
enum {
	ZONELIST_FALLBACK,	/* zonelist with fallback */
#ifdef CONFIG_NUMA
	/*
	 * The NUMA zonelists are doubled because we need zonelists that
	 * restrict the allocations to a single node for __GFP_THISNODE.
	 */
	ZONELIST_NOFALLBACK,	/* zonelist without fallback (__GFP_THISNODE) */
#endif
	MAX_ZONELISTS
};
660 : :
/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};
669 : :
670 : : /*
671 : : * One allocation request operates on a zonelist. A zonelist
672 : : * is a list of zones, the first one is the 'goal' of the
673 : : * allocation, the other zones are fallback zones, in decreasing
674 : : * priority.
675 : : *
676 : : * To speed the reading of the zonelist, the zonerefs contain the zone index
677 : : * of the entry being read. Helper functions to access information given
678 : : * a struct zoneref are
679 : : *
680 : : * zonelist_zone() - Return the struct zone * for an entry in _zonerefs
681 : : * zonelist_zone_idx() - Return the index of the zone for an entry
682 : : * zonelist_node_idx() - Return the index of the node for an entry
683 : : */
684 : : struct zonelist {
685 : : struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
686 : : };
687 : :
#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
#endif
692 : :
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* A list (split_queue) with its length and a spinlock. */
struct deferred_split {
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
};
#endif
700 : :
701 : : /*
702 : : * On NUMA machines, each NUMA node would have a pg_data_t to describe
703 : : * it's memory layout. On UMA machines there is a single pglist_data which
704 : : * describes the whole memory.
705 : : *
706 : : * Memory statistics and page replacement data structures are maintained on a
707 : : * per-zone basis.
708 : : */
709 : : struct bootmem_data;
710 : : typedef struct pglist_data {
711 : : struct zone node_zones[MAX_NR_ZONES];
712 : : struct zonelist node_zonelists[MAX_ZONELISTS];
713 : : int nr_zones;
714 : : #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
715 : : struct page *node_mem_map;
716 : : #ifdef CONFIG_PAGE_EXTENSION
717 : : struct page_ext *node_page_ext;
718 : : #endif
719 : : #endif
720 : : #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
721 : : /*
722 : : * Must be held any time you expect node_start_pfn,
723 : : * node_present_pages, node_spanned_pages or nr_zones to stay constant.
724 : : *
725 : : * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
726 : : * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
727 : : * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
728 : : *
729 : : * Nests above zone->lock and zone->span_seqlock
730 : : */
731 : : spinlock_t node_size_lock;
732 : : #endif
733 : : unsigned long node_start_pfn;
734 : : unsigned long node_present_pages; /* total number of physical pages */
735 : : unsigned long node_spanned_pages; /* total size of physical page
736 : : range, including holes */
737 : : int node_id;
738 : : wait_queue_head_t kswapd_wait;
739 : : wait_queue_head_t pfmemalloc_wait;
740 : : struct task_struct *kswapd; /* Protected by
741 : : mem_hotplug_begin/end() */
742 : : int kswapd_order;
743 : : enum zone_type kswapd_classzone_idx;
744 : :
745 : : int kswapd_failures; /* Number of 'reclaimed == 0' runs */
746 : :
747 : : #ifdef CONFIG_COMPACTION
748 : : int kcompactd_max_order;
749 : : enum zone_type kcompactd_classzone_idx;
750 : : wait_queue_head_t kcompactd_wait;
751 : : struct task_struct *kcompactd;
752 : : #endif
753 : : /*
754 : : * This is a per-node reserve of pages that are not available
755 : : * to userspace allocations.
756 : : */
757 : : unsigned long totalreserve_pages;
758 : :
759 : : #ifdef CONFIG_NUMA
760 : : /*
761 : : * node reclaim becomes active if more unmapped pages exist.
762 : : */
763 : : unsigned long min_unmapped_pages;
764 : : unsigned long min_slab_pages;
765 : : #endif /* CONFIG_NUMA */
766 : :
767 : : /* Write-intensive fields used by page reclaim */
768 : : ZONE_PADDING(_pad1_)
769 : : spinlock_t lru_lock;
770 : :
771 : : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
772 : : /*
773 : : * If memory initialisation on large machines is deferred then this
774 : : * is the first PFN that needs to be initialised.
775 : : */
776 : : unsigned long first_deferred_pfn;
777 : : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
778 : :
779 : : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
780 : : struct deferred_split deferred_split_queue;
781 : : #endif
782 : :
783 : : /* Fields commonly accessed by the page reclaim scanner */
784 : :
785 : : /*
786 : : * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
787 : : *
788 : : * Use mem_cgroup_lruvec() to look up lruvecs.
789 : : */
790 : : struct lruvec __lruvec;
791 : :
792 : : unsigned long flags;
793 : :
794 : : ZONE_PADDING(_pad2_)
795 : :
796 : : /* Per-node vmstats */
797 : : struct per_cpu_nodestat __percpu *per_cpu_nodestats;
798 : : atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
799 : : } pg_data_t;
800 : :
/* Per-node accessors keyed by node id; each looks the node up via NODE_DATA(). */
#define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
/*
 * struct page for page number @pagenr of the node: an index into
 * node_mem_map when the flat node mem_map exists, otherwise a
 * pfn_to_page() lookup relative to node_start_pfn.
 */
#ifdef CONFIG_FLAT_NODE_MEM_MAP
#define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
#else
#define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
#endif
#define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))

#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
812 : :
813 : 28 : static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
814 : : {
815 : 28 : return pgdat->node_start_pfn + pgdat->node_spanned_pages;
816 : : }
817 : :
818 : : static inline bool pgdat_is_empty(pg_data_t *pgdat)
819 : : {
820 : : return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
821 : : }
822 : :
823 : : #include <linux/memory_hotplug.h>
824 : :
825 : : void build_all_zonelists(pg_data_t *pgdat);
826 : : void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
827 : : enum zone_type classzone_idx);
828 : : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
829 : : int classzone_idx, unsigned int alloc_flags,
830 : : long free_pages);
831 : : bool zone_watermark_ok(struct zone *z, unsigned int order,
832 : : unsigned long mark, int classzone_idx,
833 : : unsigned int alloc_flags);
834 : : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
835 : : unsigned long mark, int classzone_idx);
836 : : enum memmap_context {
837 : : MEMMAP_EARLY,
838 : : MEMMAP_HOTPLUG,
839 : : };
840 : : extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
841 : : unsigned long size);
842 : :
843 : : extern void lruvec_init(struct lruvec *lruvec);
844 : :
845 : 2841945 : static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
846 : : {
847 : : #ifdef CONFIG_MEMCG
848 : : return lruvec->pgdat;
849 : : #else
850 [ # # ]: 2841945 : return container_of(lruvec, struct pglist_data, __lruvec);
851 : : #endif
852 : : }
853 : :
854 : : extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
855 : :
/* No-op stub unless CONFIG_HAVE_MEMORY_PRESENT provides the real function. */
#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
#else
static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
#endif

/* No-op stub unless CONFIG_SPARSEMEM provides the real function. */
#if defined(CONFIG_SPARSEMEM)
void memblocks_present(void);
#else
static inline void memblocks_present(void) {}
#endif
867 : :
868 : : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
869 : : int local_memory_node(int node_id);
870 : : #else
/* Without CONFIG_HAVE_MEMORYLESS_NODES the node id is returned unchanged. */
871 : 1484 : static inline int local_memory_node(int node_id) { return node_id; }
872 : : #endif
873 : :
874 : : /*
875 : : * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
876 : : */
877 : : #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
878 : :
879 : : /*
880 : : * Returns true if a zone has pages managed by the buddy allocator.
881 : : * All the reclaim decisions have to use this function rather than
882 : : * populated_zone(). If the whole zone is reserved then we can easily
883 : : * end up with populated_zone() && !managed_zone().
 *
 * The zone_managed_pages() count is converted to bool, so any non-zero
 * value yields true.
884 : : */
885 : 280 : static inline bool managed_zone(struct zone *zone)
886 : : {
887 : 280 : return zone_managed_pages(zone);
888 : : }
889 : :
890 : : /* Returns true if the zone has memory, i.e. zone->present_pages is non-zero */
891 : 49122 : static inline bool populated_zone(struct zone *zone)
892 : : {
893 [ + + + - : 49122 : return zone->present_pages;
+ + + + -
- + + + +
- - - - -
- + + ]
894 : : }
895 : :
/*
 * Node id accessors for a zone. With CONFIG_NUMA the id is stored in
 * zone->node; without it every zone reports node 0 and setting the id
 * is a no-op.
 */
896 : : #ifdef CONFIG_NUMA
897 : 8395379 : static inline int zone_to_nid(struct zone *zone)
898 : : {
899 [ + - - - : 4823461 : return zone->node;
- - - - -
+ + - ]
900 : : }
901 : :
902 : 112 : static inline void zone_set_nid(struct zone *zone, int nid)
903 : : {
904 : 112 : zone->node = nid;
905 : : }
906 : : #else
907 : : static inline int zone_to_nid(struct zone *zone)
908 : : {
909 : : return 0;
910 : : }
911 : :
912 : : static inline void zone_set_nid(struct zone *zone, int nid) {}
913 : : #endif
914 : :
915 : : extern int movable_zone;
916 : :
917 : : #ifdef CONFIG_HIGHMEM
/*
 * Non-zero when ZONE_MOVABLE is backed by highmem. With
 * CONFIG_HAVE_MEMBLOCK_NODE_MAP this compares the movable_zone variable
 * against ZONE_HIGHMEM; otherwise it checks whether the zone index directly
 * below ZONE_MOVABLE is ZONE_HIGHMEM.
 */
918 : : static inline int zone_movable_is_highmem(void)
919 : : {
920 : : #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
921 : : return movable_zone == ZONE_HIGHMEM;
922 : : #else
923 : : return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
924 : : #endif
925 : : }
926 : : #endif
927 : :
/*
 * Non-zero when zone index @idx denotes highmem: either ZONE_HIGHMEM itself,
 * or ZONE_MOVABLE when zone_movable_is_highmem() says so. Always 0 when
 * CONFIG_HIGHMEM is not set.
 */
928 : 224 : static inline int is_highmem_idx(enum zone_type idx)
929 : : {
930 : : #ifdef CONFIG_HIGHMEM
931 : : return (idx == ZONE_HIGHMEM ||
932 : : (idx == ZONE_MOVABLE && zone_movable_is_highmem()));
933 : : #else
934 [ + - ]: 224 : return 0;
935 : : #endif
936 : : }
937 : :
938 : : /**
939 : : * is_highmem - helper function to quickly check if a struct zone is a
940 : : * highmem zone or not. This is an attempt to keep references
941 : : * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
942 : : * @zone - pointer to struct zone variable
 *
 * With CONFIG_HIGHMEM the answer is is_highmem_idx(zone_idx(zone));
 * without it the function always returns 0.
943 : : */
944 : 224 : static inline int is_highmem(struct zone *zone)
945 : : {
946 : : #ifdef CONFIG_HIGHMEM
947 : : return is_highmem_idx(zone_idx(zone));
948 : : #else
949 : 224 : return 0;
950 : : #endif
951 : : }
952 : :
953 : : /* These two functions are used to setup the per zone pages min values */
954 : : struct ctl_table;
955 : : int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
956 : : void __user *, size_t *, loff_t *);
957 : : int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
958 : : void __user *, size_t *, loff_t *);
959 : : int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
960 : : void __user *, size_t *, loff_t *);
961 : : extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
962 : : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
963 : : void __user *, size_t *, loff_t *);
964 : : int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
965 : : void __user *, size_t *, loff_t *);
966 : : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
967 : : void __user *, size_t *, loff_t *);
968 : : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
969 : : void __user *, size_t *, loff_t *);
970 : :
971 : : extern int numa_zonelist_order_handler(struct ctl_table *, int,
972 : : void __user *, size_t *, loff_t *);
973 : : extern char numa_zonelist_order[];
974 : : #define NUMA_ZONELIST_ORDER_LEN 16
975 : :
976 : : #ifndef CONFIG_NEED_MULTIPLE_NODES
977 : :
978 : : extern struct pglist_data contig_page_data;
979 : : #define NODE_DATA(nid) (&contig_page_data)
980 : : #define NODE_MEM_MAP(nid) mem_map
981 : :
982 : : #else /* CONFIG_NEED_MULTIPLE_NODES */
983 : :
984 : : #include <asm/mmzone.h>
985 : :
986 : : #endif /* !CONFIG_NEED_MULTIPLE_NODES */
987 : :
988 : : extern struct pglist_data *first_online_pgdat(void);
989 : : extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
990 : : extern struct zone *next_zone(struct zone *zone);
991 : :
992 : : /**
993 : : * for_each_online_pgdat - helper macro to iterate over all online nodes
994 : : * @pgdat - pointer to a pg_data_t variable
995 : : */
996 : : #define for_each_online_pgdat(pgdat) \
997 : : for (pgdat = first_online_pgdat(); \
998 : : pgdat; \
999 : : pgdat = next_online_pgdat(pgdat))
1000 : : /**
1001 : : * for_each_zone - helper macro to iterate over all memory zones
1002 : : * @zone - pointer to struct zone variable
1003 : : *
1004 : : * The user only needs to declare the zone variable, for_each_zone
1005 : : * fills it in.
1006 : : */
1007 : : #define for_each_zone(zone) \
1008 : : for (zone = (first_online_pgdat())->node_zones; \
1009 : : zone; \
1010 : : zone = next_zone(zone))
1011 : :
/*
 * Same walk as for_each_zone(), but zones for which populated_zone() is
 * false are skipped. The expansion ends in "if (...) ; else" so that the
 * statement the caller writes after the macro becomes the else branch.
 */
1012 : : #define for_each_populated_zone(zone) \
1013 : : for (zone = (first_online_pgdat())->node_zones; \
1014 : : zone; \
1015 : : zone = next_zone(zone)) \
1016 : : if (!populated_zone(zone)) \
1017 : : ; /* do nothing */ \
1018 : : else
1019 : :
/* Return the zone pointer stored in a zoneref. */
1020 : 1877676 : static inline struct zone *zonelist_zone(struct zoneref *zoneref)
1021 : : {
1022 : 1877676 : return zoneref->zone;
1023 : : }
1024 : :
/* Return the zone index stored in a zoneref (read from zoneref->zone_idx). */
1025 : 9020778 : static inline int zonelist_zone_idx(struct zoneref *zoneref)
1026 : : {
1027 [ + + - - ]: 5448999 : return zoneref->zone_idx;
1028 : : }
1029 : :
/* Return the node id of the zone a zoneref points at, via zone_to_nid(). */
1030 : 0 : static inline int zonelist_node_idx(struct zoneref *zoneref)
1031 : : {
1032 : 0 : return zone_to_nid(zoneref->zone);
1033 : : }
1034 : :
1035 : : struct zoneref *__next_zones_zonelist(struct zoneref *z,
1036 : : enum zone_type highest_zoneidx,
1037 : : nodemask_t *nodes);
1038 : :
1039 : : /**
1040 : : * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
1041 : : * @z - The cursor used as a starting point for the search
1042 : : * @highest_zoneidx - The zone index of the highest zone to return
1043 : : * @nodes - An optional nodemask to filter the zonelist with
1044 : : *
1045 : : * This function returns the next zone at or below a given zone index that is
1046 : : * within the allowed nodemask using a cursor as the starting point for the
1047 : : * search. The zoneref returned is a cursor that represents the current zone
1048 : : * being examined. It should be advanced by one before calling
1049 : : * next_zones_zonelist again.
 *
 * Fast path: when @nodes is NULL and the cursor's zone index is already at
 * or below @highest_zoneidx, @z itself is returned; every other case is
 * delegated to __next_zones_zonelist().
1050 : : */
1051 : 5448951 : static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
1052 : : enum zone_type highest_zoneidx,
1053 : : nodemask_t *nodes)
1054 : : {
1055 [ - + + + : 4823050 : if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
- - - - -
- - - - -
- - - - -
- - - - -
- - - - ]
1056 : : return z;
1057 : 55 : return __next_zones_zonelist(z, highest_zoneidx, nodes);
1058 : : }
1059 : :
1060 : : /**
1061 : : * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
1062 : : * @zonelist - The zonelist to search for a suitable zone
1063 : : * @highest_zoneidx - The zone index of the highest zone to return
1064 : : * @nodes - An optional nodemask to filter the zonelist with
1065 : : * @return - Zoneref pointer for the first suitable zone found (see below)
1066 : : *
1067 : : * This function returns the first zone at or below a given zone index that is
1068 : : * within the allowed nodemask. The zoneref returned is a cursor that can be
1069 : : * used to iterate the zonelist with next_zones_zonelist by advancing it by
1070 : : * one before calling.
1071 : : *
1072 : : * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
1073 : : * never NULL). This may happen either genuinely, or due to concurrent nodemask
1074 : : * update due to cpuset modification.
 *
 * Implemented as next_zones_zonelist() with the cursor placed at
 * zonelist->_zonerefs, the first entry of the list.
1075 : : */
1076 : 4197176 : static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
1077 : : enum zone_type highest_zoneidx,
1078 : : nodemask_t *nodes)
1079 : : {
1080 [ + + + - : 4197176 : return next_zones_zonelist(zonelist->_zonerefs,
- - - - -
- - - -
- ]
1081 : : highest_zoneidx, nodes);
1082 : : }
1083 : :
1084 : : /**
1085 : : * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
1086 : : * @zone - The current zone in the iterator
1087 : : * @z - The current pointer within zonelist->_zonerefs being iterated
1088 : : * @zlist - The zonelist being iterated
1089 : : * @highidx - The zone index of the highest zone to return
1090 : : * @nodemask - Nodemask allowed by the allocator
1091 : : *
1092 : : * This iterator iterates through all zones at or below a given zone index and
1093 : : * within a given nodemask
1094 : : */
1095 : : #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
1096 : : for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
1097 : : zone; \
1098 : : z = next_zones_zonelist(++z, highidx, nodemask), \
1099 : : zone = zonelist_zone(z))
1100 : :
/*
 * Continue a zonelist walk from a cursor the caller already holds: the loop
 * starts at z->zone instead of calling first_zones_zonelist(), then advances
 * exactly like for_each_zone_zonelist_nodemask(). The zlist argument is not
 * used by the expansion.
 */
1101 : : #define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
1102 : : for (zone = z->zone; \
1103 : : zone; \
1104 : : z = next_zones_zonelist(++z, highidx, nodemask), \
1105 : : zone = zonelist_zone(z))
1106 : :
1107 : :
1108 : : /**
1109 : : * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
1110 : : * @zone - The current zone in the iterator
1111 : : * @z - The current pointer within zonelist->_zonerefs being iterated
1112 : : * @zlist - The zonelist being iterated
1113 : : * @highidx - The zone index of the highest zone to return
1114 : : *
1115 : : * This iterator iterates through all zones at or below a given zone index.
1116 : : */
1117 : : #define for_each_zone_zonelist(zone, z, zlist, highidx) \
1118 : : for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
1119 : :
1120 : : #ifdef CONFIG_SPARSEMEM
1121 : : #include <asm/sparsemem.h>
1122 : : #endif
1123 : :
/*
 * Fallback used when neither CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID nor
 * CONFIG_HAVE_MEMBLOCK_NODE_MAP is defined: every pfn maps to node 0.
 * The BUILD_BUG_ON() fails the build if CONFIG_NUMA is enabled, where this
 * answer would be wrong.
 */
1124 : : #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
1125 : : !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1126 : : static inline unsigned long early_pfn_to_nid(unsigned long pfn)
1127 : : {
1128 : : BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
1129 : : return 0;
1130 : : }
1131 : : #endif
1132 : :
1133 : : #ifdef CONFIG_FLATMEM
1134 : : #define pfn_to_nid(pfn) (0)
1135 : : #endif
1136 : :
1137 : : #ifdef CONFIG_SPARSEMEM
1138 : :
1139 : : /*
1140 : : * SECTION_SHIFT #bits space required to store a section #
1141 : : *
1142 : : * PA_SECTION_SHIFT physical address to/from section number
1143 : : * PFN_SECTION_SHIFT pfn to/from section number
1144 : : */
1145 : : #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
1146 : : #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
1147 : :
1148 : : #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
1149 : :
1150 : : #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
1151 : : #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
1152 : :
1153 : : #define SECTION_BLOCKFLAGS_BITS \
1154 : : ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
1155 : :
1156 : : #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
1157 : : #error Allocator MAX_ORDER exceeds SECTION_SIZE
1158 : : #endif
1159 : :
/* Convert a pfn to its section number by shifting right by PFN_SECTION_SHIFT. */
1160 : 21511989 : static inline unsigned long pfn_to_section_nr(unsigned long pfn)
1161 : : {
1162 [ + - ]: 3104 : return pfn >> PFN_SECTION_SHIFT;
1163 : : }
/* Convert a section number to the first pfn of that section (shift left by PFN_SECTION_SHIFT). */
1164 : 224 : static inline unsigned long section_nr_to_pfn(unsigned long sec)
1165 : : {
1166 [ + - ]: 224 : return sec << PFN_SECTION_SHIFT;
1167 : : }
1168 : :
1169 : : #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
1170 : : #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1171 : :
1172 : : #define SUBSECTION_SHIFT 21
1173 : :
1174 : : #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
1175 : : #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
1176 : : #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))
1177 : :
1178 : : #if SUBSECTION_SHIFT > SECTION_SIZE_BITS
1179 : : #error Subsection size exceeds section size
1180 : : #else
1181 : : #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
1182 : : #endif
1183 : :
1184 : : #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
1185 : : #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
1186 : :
/*
 * Per-section usage data: a bitmap with one bit per subsection
 * (SUBSECTIONS_PER_SECTION bits) followed by a variable-length trailing
 * array of pageblock flag words. The trailing array is declared as a C99
 * flexible array member rather than a zero-length array.
 */
1187 : : struct mem_section_usage {
1188 : : DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
1189 : : /* See declaration of similar field in struct zone */
1190 : : unsigned long pageblock_flags[];
1191 : : };
1192 : :
1193 : : void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
1194 : :
1195 : : struct page;
1196 : : struct page_ext;
/*
 * Per-section bookkeeping: an encoded mem_map value (pointer bits plus
 * low-bit flags, see the SECTION_* definitions), a pointer to the section's
 * usage data and, with CONFIG_PAGE_EXTENSION, the page_ext pointer.
 */
1197 : : struct mem_section {
1198 : : /*
1199 : : * This is, logically, a pointer to an array of struct
1200 : : * pages. However, it is stored with some other magic.
1201 : : * (see sparse.c::sparse_init_one_section())
1202 : : *
1203 : : * Additionally during early boot we encode node id of
1204 : : * the location of the section here to guide allocation.
1205 : : * (see sparse.c::memory_present())
1206 : : *
1207 : : * Making it a UL at least makes someone do a cast
1208 : : * before using it wrong.
1209 : : */
1210 : : unsigned long section_mem_map;
1211 : :
1212 : : struct mem_section_usage *usage;
1213 : : #ifdef CONFIG_PAGE_EXTENSION
1214 : : /*
1215 : : * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
1216 : : * section. (see page_ext.h about this.)
1217 : : */
1218 : : struct page_ext *page_ext;
/* NOTE(review): pad presumably keeps the size a power of two (see WARNING below) - confirm */
1219 : : unsigned long pad;
1220 : : #endif
1221 : : /*
1222 : : * WARNING: mem_section must be a power-of-2 in size for the
1223 : : * calculation and use of SECTION_ROOT_MASK to make sense.
1224 : : */
1225 : : };
1226 : :
1227 : : #ifdef CONFIG_SPARSEMEM_EXTREME
1228 : : #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
1229 : : #else
1230 : : #define SECTIONS_PER_ROOT 1
1231 : : #endif
1232 : :
1233 : : #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
1234 : : #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
1235 : : #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
1236 : :
1237 : : #ifdef CONFIG_SPARSEMEM_EXTREME
1238 : : extern struct mem_section **mem_section;
1239 : : #else
1240 : : extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
1241 : : #endif
1242 : :
/* Return the pageblock_flags array of a section; ms->usage is dereferenced unchecked. */
1243 : 2773021 : static inline unsigned long *section_to_usemap(struct mem_section *ms)
1244 : : {
1245 : 2772320 : return ms->usage->pageblock_flags;
1246 : : }
1247 : :
/*
 * Look up the mem_section entry for section number @nr.
 * Returns NULL when the root entry for @nr is NULL (and, with
 * CONFIG_SPARSEMEM_EXTREME, when the mem_section table pointer itself is
 * NULL); otherwise returns the address of the entry within its root.
 * @nr is not range-checked here.
 */
1248 : 12143975 : static inline struct mem_section *__nr_to_section(unsigned long nr)
1249 : : {
1250 : : #ifdef CONFIG_SPARSEMEM_EXTREME
1251 [ + - + - : 12143527 : if (!mem_section)
+ - - - +
- + - +
- ]
1252 : : return NULL;
1253 : : #endif
1254 [ + - + - : 12143975 : if (!mem_section[SECTION_NR_TO_ROOT(nr)])
+ - + - +
- + - - +
+ - ]
1255 : : return NULL;
1256 [ + - ]: 2775595 : return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
1257 : : }
1258 : : extern unsigned long __section_nr(struct mem_section *ms);
1259 : : extern size_t mem_section_usage_size(void);
1260 : :
1261 : : /*
1262 : : * We use the lower bits of the mem_map pointer to store
1263 : : * a little bit of information. The pointer is calculated
1264 : : * as mem_map - section_nr_to_pfn(pnum). The result is
1265 : : * aligned to the minimum alignment of the two values:
1266 : : * 1. All mem_map arrays are page-aligned.
1267 : : * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
1268 : : * lowest bits. PFN_SECTION_SHIFT is arch-specific
1269 : : * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
1270 : : * worst combination is powerpc with 256k pages,
1271 : : * which results in PFN_SECTION_SHIFT equal 6.
1272 : : * To sum it up, at least 6 bits are available.
1273 : : */
1274 : : #define SECTION_MARKED_PRESENT (1UL<<0)
1275 : : #define SECTION_HAS_MEM_MAP (1UL<<1)
1276 : : #define SECTION_IS_ONLINE (1UL<<2)
1277 : : #define SECTION_IS_EARLY (1UL<<3)
1278 : : #define SECTION_MAP_LAST_BIT (1UL<<4)
1279 : : #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
1280 : : #define SECTION_NID_SHIFT 3
1281 : :
/*
 * Strip the low flag bits from section_mem_map with SECTION_MAP_MASK and
 * return what is left as a struct page pointer.
 */
1282 : : static inline struct page *__section_mem_map_addr(struct mem_section *section)
1283 : : {
1284 : : unsigned long map = section->section_mem_map;
1285 : : map &= SECTION_MAP_MASK;
1286 : : return (struct page *)map;
1287 : : }
1288 : :
/* Non-zero when @section is non-NULL and has SECTION_MARKED_PRESENT set. */
1289 : 448 : static inline int present_section(struct mem_section *section)
1290 : : {
1291 [ + - - + ]: 448 : return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
1292 : : }
1293 : :
/* present_section() for the section looked up by number @nr. */
1294 : 448 : static inline int present_section_nr(unsigned long nr)
1295 : : {
1296 [ + - ]: 448 : return present_section(__nr_to_section(nr));
1297 : : }
1298 : :
/* Non-zero when @section is non-NULL and has SECTION_HAS_MEM_MAP set. */
1299 : 9369330 : static inline int valid_section(struct mem_section *section)
1300 : : {
1301 [ + - ]: 9369330 : return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
1302 : : }
1303 : :
/* Non-zero when @section is non-NULL and has SECTION_IS_EARLY set. */
1304 : 9369330 : static inline int early_section(struct mem_section *section)
1305 : : {
1306 : 9369330 : return (section && (section->section_mem_map & SECTION_IS_EARLY));
1307 : : }
1308 : :
/* valid_section() for the section looked up by number @nr. */
1309 : : static inline int valid_section_nr(unsigned long nr)
1310 : : {
1311 : : return valid_section(__nr_to_section(nr));
1312 : : }
1313 : :
/* Non-zero when @section is non-NULL and has SECTION_IS_ONLINE set. */
1314 : : static inline int online_section(struct mem_section *section)
1315 : : {
1316 : : return (section && (section->section_mem_map & SECTION_IS_ONLINE));
1317 : : }
1318 : :
/* online_section() for the section looked up by number @nr. */
1319 : : static inline int online_section_nr(unsigned long nr)
1320 : : {
1321 : : return online_section(__nr_to_section(nr));
1322 : : }
1323 : :
1324 : : #ifdef CONFIG_MEMORY_HOTPLUG
1325 : : void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
1326 : : #ifdef CONFIG_MEMORY_HOTREMOVE
1327 : : void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
1328 : : #endif
1329 : : #endif
1330 : :
/*
 * Return the mem_section covering @pfn, or NULL if __nr_to_section() finds
 * no entry for its section number.
 */
1331 : 2773021 : static inline struct mem_section *__pfn_to_section(unsigned long pfn)
1332 : : {
1333 [ + - + - : 2773021 : return __nr_to_section(pfn_to_section_nr(pfn));
+ - + - ]
1334 : : }
1335 : :
1336 : : extern unsigned long __highest_present_section_nr;
1337 : :
/*
 * Index of the subsection containing @pfn within its section: the pfn's
 * offset inside the section divided by PAGES_PER_SUBSECTION.
 */
1338 : 252 : static inline int subsection_map_index(unsigned long pfn)
1339 : : {
1340 [ - + ]: 252 : return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
1341 : : }
1342 : :
/*
 * Subsection-level validity of @pfn inside section @ms. With
 * CONFIG_SPARSEMEM_VMEMMAP this tests the pfn's bit in
 * ms->usage->subsection_map; otherwise every pfn is reported valid.
 */
1343 : : #ifdef CONFIG_SPARSEMEM_VMEMMAP
1344 : 0 : static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
1345 : : {
1346 : 0 : int idx = subsection_map_index(pfn);
1347 : :
1348 : 0 : return test_bit(idx, ms->usage->subsection_map);
1349 : : }
1350 : : #else
1351 : : static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
1352 : : {
1353 : : return 1;
1354 : : }
1355 : : #endif
1356 : :
1357 : : #ifndef CONFIG_HAVE_ARCH_PFN_VALID
/*
 * Generic pfn_valid(), used unless the architecture provides its own.
 * Returns 0 when the pfn's section number is out of range or its section is
 * not valid_section(); otherwise non-zero if the section is an early section
 * or pfn_section_valid() accepts the pfn.
 */
1358 : 9369330 : static inline int pfn_valid(unsigned long pfn)
1359 : : {
1360 : 9369330 : struct mem_section *ms;
1361 : :
1362 [ - + - + : 18738660 : if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
- ]
1363 : : return 0;
1364 [ + - ]: 9369330 : ms = __nr_to_section(pfn_to_section_nr(pfn));
1365 [ + - ]: 9369330 : if (!valid_section(ms))
1366 : : return 0;
1367 : : /*
1368 : : * Traditionally early sections always returned pfn_valid() for
1369 : : * the entire section-sized span.
1370 : : */
1371 [ - + - - : 9369330 : return early_section(ms) || pfn_section_valid(ms, pfn);
# # ]
1372 : : }
1373 : : #endif
1374 : :
/*
 * Non-zero when the section containing @pfn is marked present. Returns 0 if
 * the pfn's section number is NR_MEM_SECTIONS or larger.
 */
1375 : : static inline int pfn_present(unsigned long pfn)
1376 : : {
1377 : : if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1378 : : return 0;
1379 : : return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
1380 : : }
1381 : :
/*
 * Return the first present section number strictly greater than @section_nr
 * and not above __highest_present_section_nr. If there is none, -1 is
 * returned, converted to the unsigned long return type.
 */
1382 : 280 : static inline unsigned long next_present_section_nr(unsigned long section_nr)
1383 : : {
1384 [ + - + - : 504 : while (++section_nr <= __highest_present_section_nr) {
+ + + - +
+ - - -
- ]
1385 [ - + + - : 448 : if (present_section_nr(section_nr))
- + + - -
+ - - -
- ]
1386 : : return section_nr;
1387 : : }
1388 : :
1389 : : return -1;
1390 : : }
1391 : :
1392 : : /*
1393 : : * These are _only_ used during initialisation, therefore they
1394 : : * can use __initdata ... They could have names to indicate
1395 : : * this restriction.
1396 : : */
1397 : : #ifdef CONFIG_NUMA
1398 : : #define pfn_to_nid(pfn) \
1399 : : ({ \
1400 : : unsigned long __pfn_to_nid_pfn = (pfn); \
1401 : : page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
1402 : : })
1403 : : #else
1404 : : #define pfn_to_nid(pfn) (0)
1405 : : #endif
1406 : :
1407 : : #define early_pfn_valid(pfn) pfn_valid(pfn)
1408 : : void sparse_init(void);
1409 : : #else
1410 : : #define sparse_init() do {} while (0)
1411 : : #define sparse_index_init(_sec, _nid) do {} while (0)
1412 : : #define pfn_present pfn_valid
1413 : : #define subsection_map_init(_pfn, _nr_pages) do {} while (0)
1414 : : #endif /* CONFIG_SPARSEMEM */
1415 : :
1416 : : /*
1417 : : * During memory init memblocks map pfns to nids. The search is expensive and
1418 : : * this caches recent lookups. The implementation of __early_pfn_to_nid
1419 : : * may treat start/end as pfns or sections.
 *
 * The cache holds one entry: a start/end pair and the nid recorded for it.
 * NOTE(review): whether last_end is inclusive is not visible here - confirm
 * against __early_pfn_to_nid.
1420 : : */
1421 : : struct mminit_pfnnid_cache {
1422 : : unsigned long last_start;
1423 : : unsigned long last_end;
1424 : : int last_nid;
1425 : : };
1426 : :
1427 : : #ifndef early_pfn_valid
1428 : : #define early_pfn_valid(pfn) (1)
1429 : : #endif
1430 : :
1431 : : void memory_present(int nid, unsigned long start, unsigned long end);
1432 : :
1433 : : /*
1434 : : * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
1435 : : * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
1436 : : * pfn_valid_within() should be used in this case; we optimise this away
1437 : : * when we have no holes within a MAX_ORDER_NR_PAGES block.
1438 : : */
1439 : : #ifdef CONFIG_HOLES_IN_ZONE
1440 : : #define pfn_valid_within(pfn) pfn_valid(pfn)
1441 : : #else
1442 : : #define pfn_valid_within(pfn) (1)
1443 : : #endif
1444 : :
1445 : : #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
1446 : : /*
1447 : : * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
1448 : : * associated with it or not. This means that a struct page exists for this
1449 : : * pfn. The caller cannot assume the page is fully initialized in general.
1450 : : * Hotplugable pages might not have been onlined yet. pfn_to_online_page()
1451 : : * will ensure the struct page is fully online and initialized. Special pages
1452 : : * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly.
1453 : : *
1454 : : * In FLATMEM, it is expected that holes always have valid memmap as long as
1455 : : * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed
1456 : : * that a valid section has a memmap for the entire section.
1457 : : *
1458 : : * However, ARM, and maybe other embedded architectures in the future,
1459 : : * free the memmap backing holes to save memory on the assumption the memmap is
1460 : : * never used. The page_zone linkages are then broken even though pfn_valid()
1461 : : * returns true. A walker of the full memmap must then do this additional
1462 : : * check to ensure the memmap they are looking at is sane by making sure
1463 : : * the zone and PFN linkages are still valid. This is expensive, but walkers
1464 : : * of the full memmap are extremely rare.
1465 : : */
1466 : : bool memmap_valid_within(unsigned long pfn,
1467 : : struct page *page, struct zone *zone);
1468 : : #else
/*
 * Stub for configurations without CONFIG_ARCH_HAS_HOLES_MEMORYMODEL: no
 * check is performed, the arguments are ignored and true is returned.
 */
1469 : 56 : static inline bool memmap_valid_within(unsigned long pfn,
1470 : : struct page *page, struct zone *zone)
1471 : : {
1472 [ # # ]: 56 : return true;
1473 : : }
1474 : : #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
1475 : :
1476 : : #endif /* !__GENERATING_BOUNDS_H */
1477 : : #endif /* !__ASSEMBLY__ */
1478 : : #endif /* _LINUX_MMZONE_H */
|