Branch data Line data Source code
1 : : /* SPDX-License-Identifier: GPL-2.0 */
2 : : #ifndef _LINUX_MMZONE_H
3 : : #define _LINUX_MMZONE_H
4 : :
5 : : #ifndef __ASSEMBLY__
6 : : #ifndef __GENERATING_BOUNDS_H
7 : :
8 : : #include <linux/spinlock.h>
9 : : #include <linux/list.h>
10 : : #include <linux/wait.h>
11 : : #include <linux/bitops.h>
12 : : #include <linux/cache.h>
13 : : #include <linux/threads.h>
14 : : #include <linux/numa.h>
15 : : #include <linux/init.h>
16 : : #include <linux/seqlock.h>
17 : : #include <linux/nodemask.h>
18 : : #include <linux/pageblock-flags.h>
19 : : #include <linux/page-flags-layout.h>
20 : : #include <linux/atomic.h>
21 : : #include <linux/mm_types.h>
22 : : #include <linux/page-flags.h>
23 : : #include <asm/page.h>
24 : :
25 : : /* Free memory management - zoned buddy allocator. */
26 : : #ifndef CONFIG_FORCE_MAX_ZONEORDER
27 : : #define MAX_ORDER 11
28 : : #else
29 : : #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
30 : : #endif
31 : : #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
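/*
 * Worked example (illustrative, not part of the original header): with the
 * default MAX_ORDER of 11 and 4 KiB pages, the largest buddy block has
 * order MAX_ORDER - 1 = 10, so MAX_ORDER_NR_PAGES = 1 << 10 = 1024 pages,
 * i.e. a 4 MiB physically contiguous allocation.
 */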
32 : :
33 : : /*
34 : : * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
35 : : * costly to service. It marks the boundary between allocation orders
36 : : * which should coalesce naturally under reasonable reclaim pressure
37 : : * and those which will not.
38 : : */
39 : : #define PAGE_ALLOC_COSTLY_ORDER 3
40 : :
41 : : enum migratetype {
42 : : MIGRATE_UNMOVABLE,
43 : : MIGRATE_MOVABLE,
44 : : MIGRATE_RECLAIMABLE,
45 : : MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
46 : : MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
47 : : #ifdef CONFIG_CMA
48 : : /*
49 : : * MIGRATE_CMA migration type is designed to mimic the way
50 : : * ZONE_MOVABLE works. Only movable pages can be allocated
51 : : * from MIGRATE_CMA pageblocks, and the page allocator never
52 : : * implicitly changes the migration type of a MIGRATE_CMA pageblock.
53 : : *
54 : : * The way to use it is to change migratetype of a range of
55 : : * pageblocks to MIGRATE_CMA which can be done by
56 : : * __free_pageblock_cma() function. What is important, though,
57 : : * is that the range of pageblocks must be aligned to
58 : : * MAX_ORDER_NR_PAGES if the biggest page is bigger than
59 : : * a single pageblock.
60 : : */
61 : : MIGRATE_CMA,
62 : : #endif
63 : : #ifdef CONFIG_MEMORY_ISOLATION
64 : : MIGRATE_ISOLATE, /* can't allocate from here */
65 : : #endif
66 : : MIGRATE_TYPES
67 : : };
68 : :
69 : : /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
70 : : extern const char * const migratetype_names[MIGRATE_TYPES];
71 : :
72 : : #ifdef CONFIG_CMA
73 : : # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
74 : : # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
75 : : #else
76 : : # define is_migrate_cma(migratetype) false
77 : : # define is_migrate_cma_page(_page) false
78 : : #endif
79 : :
80 : : static inline bool is_migrate_movable(int mt)
81 : : {
82 [ # # # # # # # # ]: 0 : return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
83 : : }
84 : :
85 : : #define for_each_migratetype_order(order, type) \
86 : : for (order = 0; order < MAX_ORDER; order++) \
87 : : for (type = 0; type < MIGRATE_TYPES; type++)
88 : :
89 : : extern int page_group_by_mobility_disabled;
90 : :
91 : : #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
92 : : #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
93 : :
94 : : #define get_pageblock_migratetype(page) \
95 : : get_pfnblock_flags_mask(page, page_to_pfn(page), \
96 : : PB_migrate_end, MIGRATETYPE_MASK)
97 : :
98 : : struct free_area {
99 : : struct list_head free_list[MIGRATE_TYPES];
100 : : unsigned long nr_free;
101 : : };
102 : :
103 : : /* Used for pages not on another list */
104 : : static inline void add_to_free_area(struct page *page, struct free_area *area,
105 : : int migratetype)
106 : : {
107 : 27272658 : list_add(&page->lru, &area->free_list[migratetype]);
108 : 27272658 : area->nr_free++;
109 : : }
110 : :
111 : : /* Used for pages not on another list */
112 : : static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
113 : : int migratetype)
114 : : {
115 : 4011723 : list_add_tail(&page->lru, &area->free_list[migratetype]);
116 : 4011723 : area->nr_free++;
117 : : }
118 : :
119 : : #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
120 : : /* Used to preserve page allocation order entropy */
121 : : void add_to_free_area_random(struct page *page, struct free_area *area,
122 : : int migratetype);
123 : : #else
124 : : static inline void add_to_free_area_random(struct page *page,
125 : : struct free_area *area, int migratetype)
126 : : {
127 : : add_to_free_area(page, area, migratetype);
128 : : }
129 : : #endif
130 : :
131 : : /* Used for pages which are on another list */
132 : : static inline void move_to_free_area(struct page *page, struct free_area *area,
133 : : int migratetype)
134 : : {
135 : 57966 : list_move(&page->lru, &area->free_list[migratetype]);
136 : : }
137 : :
138 : 42925515 : static inline struct page *get_page_from_free_area(struct free_area *area,
139 : : int migratetype)
140 : : {
141 [ # # + + ]: 85851030 : return list_first_entry_or_null(&area->free_list[migratetype],
142 : : struct page, lru);
143 : : }
144 : :
145 : 22588380 : static inline void del_page_from_free_area(struct page *page,
146 : : struct free_area *area)
147 : : {
148 : : list_del(&page->lru);
149 : : __ClearPageBuddy(page);
150 : 31222994 : set_page_private(page, 0);
151 : 31222994 : area->nr_free--;
152 : 22588380 : }
153 : :
154 : : static inline bool free_area_empty(struct free_area *area, int migratetype)
155 : : {
156 : 1401768 : return list_empty(&area->free_list[migratetype]);
157 : : }
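/*
 * Illustrative sketch (not part of the original header): how the free-list
 * helpers above compose. The function name is hypothetical; it reports the
 * first migratetype that still has pages queued in this free_area, or -1
 * if every list at this order is empty.
 */
static inline int first_nonempty_migratetype_sketch(struct free_area *area)
{
	int mt;

	for (mt = 0; mt < MIGRATE_TYPES; mt++)
		if (!free_area_empty(area, mt))
			return mt;

	return -1;
}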
158 : :
159 : : struct pglist_data;
160 : :
161 : : /*
162 : : * zone->lock and the pgdat->lru_lock are two of the hottest locks in the kernel.
163 : : * So add a wild amount of padding here to ensure that they fall into separate
164 : : * cachelines. There are very few zone structures in the machine, so space
165 : : * consumption is not a concern here.
166 : : */
167 : : #if defined(CONFIG_SMP)
168 : : struct zone_padding {
169 : : char x[0];
170 : : } ____cacheline_internodealigned_in_smp;
171 : : #define ZONE_PADDING(name) struct zone_padding name;
172 : : #else
173 : : #define ZONE_PADDING(name)
174 : : #endif
175 : :
176 : : #ifdef CONFIG_NUMA
177 : : enum numa_stat_item {
178 : : NUMA_HIT, /* allocated in intended node */
179 : : NUMA_MISS, /* allocated in non intended node */
180 : : NUMA_FOREIGN, /* was intended here, hit elsewhere */
181 : : NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */
182 : : NUMA_LOCAL, /* allocation from local node */
183 : : NUMA_OTHER, /* allocation from other node */
184 : : NR_VM_NUMA_STAT_ITEMS
185 : : };
186 : : #else
187 : : #define NR_VM_NUMA_STAT_ITEMS 0
188 : : #endif
189 : :
190 : : enum zone_stat_item {
191 : : /* First 128 byte cacheline (assuming 64 bit words) */
192 : : NR_FREE_PAGES,
193 : : NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
194 : : NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
195 : : NR_ZONE_ACTIVE_ANON,
196 : : NR_ZONE_INACTIVE_FILE,
197 : : NR_ZONE_ACTIVE_FILE,
198 : : NR_ZONE_UNEVICTABLE,
199 : : NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
200 : : NR_MLOCK, /* mlock()ed pages found and moved off LRU */
201 : : NR_PAGETABLE, /* used for pagetables */
202 : : NR_KERNEL_STACK_KB, /* measured in KiB */
203 : : /* Second 128 byte cacheline */
204 : : NR_BOUNCE,
205 : : #if IS_ENABLED(CONFIG_ZSMALLOC)
206 : : NR_ZSPAGES, /* allocated in zsmalloc */
207 : : #endif
208 : : NR_FREE_CMA_PAGES,
209 : : NR_VM_ZONE_STAT_ITEMS };
210 : :
211 : : enum node_stat_item {
212 : : NR_LRU_BASE,
213 : : NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
214 : : NR_ACTIVE_ANON, /* " " " " " */
215 : : NR_INACTIVE_FILE, /* " " " " " */
216 : : NR_ACTIVE_FILE, /* " " " " " */
217 : : NR_UNEVICTABLE, /* " " " " " */
218 : : NR_SLAB_RECLAIMABLE,
219 : : NR_SLAB_UNRECLAIMABLE,
220 : : NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
221 : : NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
222 : : WORKINGSET_NODES,
223 : : WORKINGSET_REFAULT,
224 : : WORKINGSET_ACTIVATE,
225 : : WORKINGSET_RESTORE,
226 : : WORKINGSET_NODERECLAIM,
227 : : NR_ANON_MAPPED, /* Mapped anonymous pages */
228 : : NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
229 : : only modified from process context */
230 : : NR_FILE_PAGES,
231 : : NR_FILE_DIRTY,
232 : : NR_WRITEBACK,
233 : : NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
234 : : NR_SHMEM, /* shmem pages (includes tmpfs/GEM pages) */
235 : : NR_SHMEM_THPS,
236 : : NR_SHMEM_PMDMAPPED,
237 : : NR_FILE_THPS,
238 : : NR_FILE_PMDMAPPED,
239 : : NR_ANON_THPS,
240 : : NR_UNSTABLE_NFS, /* NFS unstable pages */
241 : : NR_VMSCAN_WRITE,
242 : : NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
243 : : NR_DIRTIED, /* page dirtyings since bootup */
244 : : NR_WRITTEN, /* page writings since bootup */
245 : : NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
246 : : NR_VM_NODE_STAT_ITEMS
247 : : };
248 : :
249 : : /*
250 : : * We do arithmetic on the LRU lists in various places in the code,
251 : : * so it is important to keep the active lists LRU_ACTIVE higher in
252 : : * the array than the corresponding inactive lists, and to keep
253 : : * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
254 : : *
255 : : * This has to be kept in sync with the statistics in zone_stat_item
256 : : * above and the descriptions in vmstat_text in mm/vmstat.c
257 : : */
258 : : #define LRU_BASE 0
259 : : #define LRU_ACTIVE 1
260 : : #define LRU_FILE 2
261 : :
262 : : enum lru_list {
263 : : LRU_INACTIVE_ANON = LRU_BASE,
264 : : LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
265 : : LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
266 : : LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
267 : : LRU_UNEVICTABLE,
268 : : NR_LRU_LISTS
269 : : };
270 : :
271 : : #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
272 : :
273 : : #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
274 : :
275 : : static inline int is_file_lru(enum lru_list lru)
276 : : {
277 : 0 : return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
278 : : }
279 : :
280 : : static inline int is_active_lru(enum lru_list lru)
281 : : {
282 : 0 : return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
283 : : }
284 : :
285 : : struct zone_reclaim_stat {
286 : : /*
287 : : * The pageout code in vmscan.c keeps track of how many of the
288 : : * mem/swap backed and file backed pages are referenced.
289 : : * The higher the rotated/scanned ratio, the more valuable
290 : : * that cache is.
291 : : *
292 : : * The anon LRU stats live in [0], file LRU stats in [1]
293 : : */
294 : : unsigned long recent_rotated[2];
295 : : unsigned long recent_scanned[2];
296 : : };
297 : :
298 : : struct lruvec {
299 : : struct list_head lists[NR_LRU_LISTS];
300 : : struct zone_reclaim_stat reclaim_stat;
301 : : /* Evictions & activations on the inactive file list */
302 : : atomic_long_t inactive_age;
303 : : /* Refaults at the time of last reclaim cycle */
304 : : unsigned long refaults;
305 : : #ifdef CONFIG_MEMCG
306 : : struct pglist_data *pgdat;
307 : : #endif
308 : : };
309 : :
310 : : /* Isolate unmapped file */
311 : : #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
312 : : /* Isolate for asynchronous migration */
313 : : #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
314 : : /* Isolate unevictable pages */
315 : : #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8)
316 : :
317 : : /* LRU Isolation modes. */
318 : : typedef unsigned __bitwise isolate_mode_t;
319 : :
320 : : enum zone_watermarks {
321 : : WMARK_MIN,
322 : : WMARK_LOW,
323 : : WMARK_HIGH,
324 : : NR_WMARK
325 : : };
326 : :
327 : : #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
328 : : #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
329 : : #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
330 : : #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
331 : :
332 : : struct per_cpu_pages {
333 : : int count; /* number of pages in the list */
334 : : int high; /* high watermark, emptying needed */
335 : : int batch; /* chunk size for buddy add/remove */
336 : :
337 : : /* Lists of pages, one per migrate type stored on the pcp-lists */
338 : : struct list_head lists[MIGRATE_PCPTYPES];
339 : : };
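/*
 * Illustrative note (an assumption drawn from mm/page_alloc.c behaviour,
 * not stated in this header): once count rises above high, roughly a
 * batch worth of pages is drained back to the buddy free lists, and an
 * empty pcp list is refilled batch pages at a time, so high and batch
 * bound how far this per-cpu cache can drift from the zone counters.
 */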
340 : :
341 : : struct per_cpu_pageset {
342 : : struct per_cpu_pages pcp;
343 : : #ifdef CONFIG_NUMA
344 : : s8 expire;
345 : : u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
346 : : #endif
347 : : #ifdef CONFIG_SMP
348 : : s8 stat_threshold;
349 : : s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
350 : : #endif
351 : : };
352 : :
353 : : struct per_cpu_nodestat {
354 : : s8 stat_threshold;
355 : : s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
356 : : };
357 : :
358 : : #endif /* !__GENERATING_BOUNDS_H */
359 : :
360 : : enum zone_type {
361 : : /*
362 : : * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
363 : : * to DMA to all of the addressable memory (ZONE_NORMAL).
364 : : * On architectures where this area covers the whole 32 bit address
365 : : * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
366 : : * DMA addressing constraints. This distinction is important as a 32bit
367 : : * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
368 : : * platforms may need both zones as they support peripherals with
369 : : * different DMA addressing limitations.
370 : : *
371 : : * Some examples:
372 : : *
373 : : * - i386 and x86_64 have a fixed 16M ZONE_DMA and ZONE_DMA32 for the
374 : : * rest of the lower 4G.
375 : : *
376 : : * - arm only uses ZONE_DMA, the size, up to 4G, may vary depending on
377 : : * the specific device.
378 : : *
379 : : * - arm64 has a fixed 1G ZONE_DMA and ZONE_DMA32 for the rest of the
380 : : * lower 4G.
381 : : *
382 : : * - powerpc only uses ZONE_DMA, the size, up to 2G, may vary
383 : : * depending on the specific device.
384 : : *
385 : : * - s390 uses ZONE_DMA fixed to the lower 2G.
386 : : *
387 : : * - ia64 and riscv only use ZONE_DMA32.
388 : : *
389 : : * - parisc uses neither.
390 : : */
391 : : #ifdef CONFIG_ZONE_DMA
392 : : ZONE_DMA,
393 : : #endif
394 : : #ifdef CONFIG_ZONE_DMA32
395 : : ZONE_DMA32,
396 : : #endif
397 : : /*
398 : : * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
399 : : * performed on pages in ZONE_NORMAL if the DMA devices support
400 : : * transfers to all addressable memory.
401 : : */
402 : : ZONE_NORMAL,
403 : : #ifdef CONFIG_HIGHMEM
404 : : /*
405 : : * A memory area that is only addressable by the kernel through
406 : : * mapping portions into its own address space. This is for example
407 : : * used by i386 to allow the kernel to address the memory beyond
408 : : * 900MB. The kernel will set up special mappings (page
409 : : * table entries on i386) for each page that the kernel needs to
410 : : * access.
411 : : */
412 : : ZONE_HIGHMEM,
413 : : #endif
414 : : ZONE_MOVABLE,
415 : : #ifdef CONFIG_ZONE_DEVICE
416 : : ZONE_DEVICE,
417 : : #endif
418 : : __MAX_NR_ZONES
419 : :
420 : : };
421 : :
422 : : #ifndef __GENERATING_BOUNDS_H
423 : :
424 : : struct zone {
425 : : /* Read-mostly fields */
426 : :
427 : : /* zone watermarks, access with *_wmark_pages(zone) macros */
428 : : unsigned long _watermark[NR_WMARK];
429 : : unsigned long watermark_boost;
430 : :
431 : : unsigned long nr_reserved_highatomic;
432 : :
433 : : /*
434 : : * We don't know if the memory that we're going to allocate will be
435 : : * freeable and/or will be released eventually, so to avoid totally
436 : : * wasting several GB of RAM we must reserve some of the lower zone
437 : : * memory (otherwise we risk running OOM on the lower zones despite
438 : : * there being tons of freeable RAM in the higher zones). This array is
439 : : * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
440 : : * changes.
441 : : */
442 : : long lowmem_reserve[MAX_NR_ZONES];
443 : :
444 : : #ifdef CONFIG_NUMA
445 : : int node;
446 : : #endif
447 : : struct pglist_data *zone_pgdat;
448 : : struct per_cpu_pageset __percpu *pageset;
449 : :
450 : : #ifndef CONFIG_SPARSEMEM
451 : : /*
452 : : * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
453 : : * In SPARSEMEM, this map is stored in struct mem_section
454 : : */
455 : : unsigned long *pageblock_flags;
456 : : #endif /* CONFIG_SPARSEMEM */
457 : :
458 : : /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
459 : : unsigned long zone_start_pfn;
460 : :
461 : : /*
462 : : * spanned_pages is the total pages spanned by the zone, including
463 : : * holes, which is calculated as:
464 : : * spanned_pages = zone_end_pfn - zone_start_pfn;
465 : : *
466 : : * present_pages is physical pages existing within the zone, which
467 : : * is calculated as:
468 : : * present_pages = spanned_pages - absent_pages(pages in holes);
469 : : *
470 : : * managed_pages is present pages managed by the buddy system, which
471 : : * is calculated as (reserved_pages includes pages allocated by the
472 : : * bootmem allocator):
473 : : * managed_pages = present_pages - reserved_pages;
474 : : *
475 : : * So present_pages may be used by memory hotplug or memory power
476 : : * management logic to figure out unmanaged pages by checking
477 : : * (present_pages - managed_pages). And managed_pages should be used
478 : : * by the page allocator and VM scanner to calculate all kinds of watermarks
479 : : * and thresholds.
480 : : *
481 : : * Locking rules:
482 : : *
483 : : * zone_start_pfn and spanned_pages are protected by span_seqlock.
484 : : * It is a seqlock because it has to be read outside of zone->lock,
485 : : * and it is done in the main allocator path. But, it is written
486 : : * quite infrequently.
487 : : *
488 : : * The span_seqlock is declared along with zone->lock because it is
489 : : * frequently read in proximity to zone->lock. It's good to
490 : : * give them a chance of being in the same cacheline.
491 : : *
492 : : * Write access to present_pages at runtime should be protected by
493 : : * mem_hotplug_begin/end(). Any reader who can't tolerate drift of
494 : : * present_pages should use get_online_mems() to get a stable value.
495 : : */
496 : : atomic_long_t managed_pages;
497 : : unsigned long spanned_pages;
498 : : unsigned long present_pages;
499 : :
500 : : const char *name;
501 : :
502 : : #ifdef CONFIG_MEMORY_ISOLATION
503 : : /*
504 : : * Number of isolated pageblocks. It is used to solve an incorrect
505 : : * freepage counting problem caused by racy retrieval of a
506 : : * pageblock's migratetype. Protected by zone->lock.
507 : : */
508 : : unsigned long nr_isolate_pageblock;
509 : : #endif
510 : :
511 : : #ifdef CONFIG_MEMORY_HOTPLUG
512 : : /* see spanned/present_pages for more description */
513 : : seqlock_t span_seqlock;
514 : : #endif
515 : :
516 : : int initialized;
517 : :
518 : : /* Write-intensive fields used from the page allocator */
519 : : ZONE_PADDING(_pad1_)
520 : :
521 : : /* free areas of different sizes */
522 : : struct free_area free_area[MAX_ORDER];
523 : :
524 : : /* zone flags, see below */
525 : : unsigned long flags;
526 : :
527 : : /* Primarily protects free_area */
528 : : spinlock_t lock;
529 : :
530 : : /* Write-intensive fields used by compaction and vmstats. */
531 : : ZONE_PADDING(_pad2_)
532 : :
533 : : /*
534 : : * When free pages are below this point, additional steps are taken
535 : : * when reading the number of free pages to avoid per-cpu counter
536 : : * drift allowing watermarks to be breached
537 : : */
538 : : unsigned long percpu_drift_mark;
539 : :
540 : : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
541 : : /* pfn where compaction free scanner should start */
542 : : unsigned long compact_cached_free_pfn;
543 : : /* pfn where async and sync compaction migration scanner should start */
544 : : unsigned long compact_cached_migrate_pfn[2];
545 : : unsigned long compact_init_migrate_pfn;
546 : : unsigned long compact_init_free_pfn;
547 : : #endif
548 : :
549 : : #ifdef CONFIG_COMPACTION
550 : : /*
551 : : * On compaction failure, 1<<compact_defer_shift compactions
552 : : * are skipped before trying again. The number attempted since
553 : : * last failure is tracked with compact_considered.
554 : : */
555 : : unsigned int compact_considered;
556 : : unsigned int compact_defer_shift;
557 : : int compact_order_failed;
558 : : #endif
559 : :
560 : : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
561 : : /* Set to true when the PG_migrate_skip bits should be cleared */
562 : : bool compact_blockskip_flush;
563 : : #endif
564 : :
565 : : bool contiguous;
566 : :
567 : : ZONE_PADDING(_pad3_)
568 : : /* Zone statistics */
569 : : atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
570 : : atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
571 : : } ____cacheline_internodealigned_in_smp;
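/*
 * Worked example (illustrative, with invented numbers) for the
 * spanned/present/managed relationship documented in struct zone above:
 * a zone spanning 262144 PFNs has spanned_pages = 262144; if 4096 of
 * those PFNs fall in holes, present_pages = 258048; if the boot-time
 * allocator keeps another 2048 pages, managed_pages = 256000. The
 * hypothetical helper below recovers the "unmanaged" remainder.
 */
static inline unsigned long zone_unmanaged_pages_sketch(struct zone *zone)
{
	return zone->present_pages -
	       (unsigned long)atomic_long_read(&zone->managed_pages);
}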
572 : :
573 : : enum pgdat_flags {
574 : : PGDAT_CONGESTED, /* pgdat has many dirty pages backed by
575 : : * a congested BDI
576 : : */
577 : : PGDAT_DIRTY, /* reclaim scanning has recently found
578 : : * many dirty file pages at the tail
579 : : * of the LRU.
580 : : */
581 : : PGDAT_WRITEBACK, /* reclaim scanning has recently found
582 : : * many pages under writeback
583 : : */
584 : : PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
585 : : };
586 : :
587 : : enum zone_flags {
588 : : ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks.
589 : : * Cleared when kswapd is woken.
590 : : */
591 : : };
592 : :
593 : : static inline unsigned long zone_managed_pages(struct zone *zone)
594 : : {
595 : : return (unsigned long)atomic_long_read(&zone->managed_pages);
596 : : }
597 : :
598 : : static inline unsigned long zone_end_pfn(const struct zone *zone)
599 : : {
600 : 65826 : return zone->zone_start_pfn + zone->spanned_pages;
601 : : }
602 : :
603 : : static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
604 : : {
605 [ + - - + + - - + ]: 33120 : return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
606 : : }
607 : :
608 : : static inline bool zone_is_initialized(struct zone *zone)
609 : : {
610 : : return zone->initialized;
611 : : }
612 : :
613 : : static inline bool zone_is_empty(struct zone *zone)
614 : : {
615 : : return zone->spanned_pages == 0;
616 : : }
617 : :
618 : : /*
619 : : * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
620 : : * intersection with the given zone
621 : : */
622 : : static inline bool zone_intersects(struct zone *zone,
623 : : unsigned long start_pfn, unsigned long nr_pages)
624 : : {
625 : : if (zone_is_empty(zone))
626 : : return false;
627 : : if (start_pfn >= zone_end_pfn(zone) ||
628 : : start_pfn + nr_pages <= zone->zone_start_pfn)
629 : : return false;
630 : :
631 : : return true;
632 : : }
633 : :
634 : : /*
635 : : * The "priority" of VM scanning is how much of the queues we will scan in one
636 : : * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
637 : : * queues ("queue_length >> 12") during an aging round.
638 : : */
639 : : #define DEF_PRIORITY 12
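/*
 * Worked example (illustrative): at DEF_PRIORITY == 12, an LRU list of
 * 8388608 pages is scanned in chunks of 8388608 >> 12 = 2048 pages per
 * pass; each further reclaim pass lowers the priority, doubling the
 * fraction of the list that is scanned.
 */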
640 : :
641 : : /* Maximum number of zones on a zonelist */
642 : : #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
643 : :
644 : : enum {
645 : : ZONELIST_FALLBACK, /* zonelist with fallback */
646 : : #ifdef CONFIG_NUMA
647 : : /*
648 : : * The NUMA zonelists are doubled because we need zonelists that
649 : : * restrict the allocations to a single node for __GFP_THISNODE.
650 : : */
651 : : ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
652 : : #endif
653 : : MAX_ZONELISTS
654 : : };
655 : :
656 : : /*
657 : : * This struct contains information about a zone in a zonelist. It is stored
658 : : * here to avoid dereferences into large structures and lookups of tables
659 : : */
660 : : struct zoneref {
661 : : struct zone *zone; /* Pointer to actual zone */
662 : : int zone_idx; /* zone_idx(zoneref->zone) */
663 : : };
664 : :
665 : : /*
666 : : * One allocation request operates on a zonelist. A zonelist
667 : : * is a list of zones, the first one is the 'goal' of the
668 : : * allocation, the other zones are fallback zones, in decreasing
669 : : * priority.
670 : : *
671 : : * To speed the reading of the zonelist, the zonerefs contain the zone index
672 : : * of the entry being read. Helper functions to access information given
673 : : * a struct zoneref are
674 : : *
675 : : * zonelist_zone() - Return the struct zone * for an entry in _zonerefs
676 : : * zonelist_zone_idx() - Return the index of the zone for an entry
677 : : * zonelist_node_idx() - Return the index of the node for an entry
678 : : */
679 : : struct zonelist {
680 : : struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
681 : : };
682 : :
683 : : #ifndef CONFIG_DISCONTIGMEM
684 : : /* The array of struct pages - for discontigmem use pgdat->lmem_map */
685 : : extern struct page *mem_map;
686 : : #endif
687 : :
688 : : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
689 : : struct deferred_split {
690 : : spinlock_t split_queue_lock;
691 : : struct list_head split_queue;
692 : : unsigned long split_queue_len;
693 : : };
694 : : #endif
695 : :
696 : : /*
697 : : * On NUMA machines, each NUMA node has a pg_data_t to describe
698 : : * its memory layout. On UMA machines there is a single pglist_data which
699 : : * describes the whole memory.
700 : : *
701 : : * Memory statistics and page replacement data structures are maintained on a
702 : : * per-zone basis.
703 : : */
704 : : struct bootmem_data;
705 : : typedef struct pglist_data {
706 : : struct zone node_zones[MAX_NR_ZONES];
707 : : struct zonelist node_zonelists[MAX_ZONELISTS];
708 : : int nr_zones;
709 : : #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
710 : : struct page *node_mem_map;
711 : : #ifdef CONFIG_PAGE_EXTENSION
712 : : struct page_ext *node_page_ext;
713 : : #endif
714 : : #endif
715 : : #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
716 : : /*
717 : : * Must be held any time you expect node_start_pfn,
718 : : * node_present_pages, node_spanned_pages or nr_zones to stay constant.
719 : : * Also synchronizes pgdat->first_deferred_pfn during deferred page
720 : : * init.
721 : : *
722 : : * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
723 : : * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
724 : : * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
725 : : *
726 : : * Nests above zone->lock and zone->span_seqlock
727 : : */
728 : : spinlock_t node_size_lock;
729 : : #endif
730 : : unsigned long node_start_pfn;
731 : : unsigned long node_present_pages; /* total number of physical pages */
732 : : unsigned long node_spanned_pages; /* total size of physical page
733 : : range, including holes */
734 : : int node_id;
735 : : wait_queue_head_t kswapd_wait;
736 : : wait_queue_head_t pfmemalloc_wait;
737 : : struct task_struct *kswapd; /* Protected by
738 : : mem_hotplug_begin/end() */
739 : : int kswapd_order;
740 : : enum zone_type kswapd_classzone_idx;
741 : :
742 : : int kswapd_failures; /* Number of 'reclaimed == 0' runs */
743 : :
744 : : #ifdef CONFIG_COMPACTION
745 : : int kcompactd_max_order;
746 : : enum zone_type kcompactd_classzone_idx;
747 : : wait_queue_head_t kcompactd_wait;
748 : : struct task_struct *kcompactd;
749 : : #endif
750 : : /*
751 : : * This is a per-node reserve of pages that are not available
752 : : * to userspace allocations.
753 : : */
754 : : unsigned long totalreserve_pages;
755 : :
756 : : #ifdef CONFIG_NUMA
757 : : /*
758 : : * zone reclaim becomes active if more unmapped pages exist.
759 : : */
760 : : unsigned long min_unmapped_pages;
761 : : unsigned long min_slab_pages;
762 : : #endif /* CONFIG_NUMA */
763 : :
764 : : /* Write-intensive fields used by page reclaim */
765 : : ZONE_PADDING(_pad1_)
766 : : spinlock_t lru_lock;
767 : :
768 : : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
769 : : /*
770 : : * If memory initialisation on large machines is deferred then this
771 : : * is the first PFN that needs to be initialised.
772 : : */
773 : : unsigned long first_deferred_pfn;
774 : : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
775 : :
776 : : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
777 : : struct deferred_split deferred_split_queue;
778 : : #endif
779 : :
780 : : /* Fields commonly accessed by the page reclaim scanner */
781 : : struct lruvec lruvec;
782 : :
783 : : unsigned long flags;
784 : :
785 : : ZONE_PADDING(_pad2_)
786 : :
787 : : /* Per-node vmstats */
788 : : struct per_cpu_nodestat __percpu *per_cpu_nodestats;
789 : : atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
790 : : } pg_data_t;
791 : :
792 : : #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
793 : : #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
794 : : #ifdef CONFIG_FLAT_NODE_MEM_MAP
795 : : #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr))
796 : : #else
797 : : #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr))
798 : : #endif
799 : : #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
800 : :
801 : : #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
802 : : #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
803 : :
804 : : static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
805 : : {
806 : 207 : return &pgdat->lruvec;
807 : : }
808 : :
809 : : static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
810 : : {
811 : 207 : return pgdat->node_start_pfn + pgdat->node_spanned_pages;
812 : : }
813 : :
814 : : static inline bool pgdat_is_empty(pg_data_t *pgdat)
815 : : {
816 : : return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
817 : : }
818 : :
819 : : #include <linux/memory_hotplug.h>
820 : :
821 : : void build_all_zonelists(pg_data_t *pgdat);
822 : : void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
823 : : enum zone_type classzone_idx);
824 : : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
825 : : int classzone_idx, unsigned int alloc_flags,
826 : : long free_pages);
827 : : bool zone_watermark_ok(struct zone *z, unsigned int order,
828 : : unsigned long mark, int classzone_idx,
829 : : unsigned int alloc_flags);
830 : : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
831 : : unsigned long mark, int classzone_idx);
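/*
 * Illustrative sketch (not part of the original header): typical use of
 * the watermark helpers declared above. Passing 0 for alloc_flags is a
 * simplification; the real allocator passes its internal ALLOC_* flags
 * from mm/internal.h.
 */
static inline bool zone_allows_order0_alloc_sketch(struct zone *z,
						   int classzone_idx)
{
	return zone_watermark_ok(z, 0, low_wmark_pages(z), classzone_idx, 0);
}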
832 : : enum memmap_context {
833 : : MEMMAP_EARLY,
834 : : MEMMAP_HOTPLUG,
835 : : };
836 : : extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
837 : : unsigned long size);
838 : :
839 : : extern void lruvec_init(struct lruvec *lruvec);
840 : :
841 : 50595873 : static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
842 : : {
843 : : #ifdef CONFIG_MEMCG
844 : 102253677 : return lruvec->pgdat;
845 : : #else
846 : : return container_of(lruvec, struct pglist_data, lruvec);
847 : : #endif
848 : : }
849 : :
850 : : extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
851 : :
852 : : #ifdef CONFIG_HAVE_MEMORY_PRESENT
853 : : void memory_present(int nid, unsigned long start, unsigned long end);
854 : : #else
855 : : static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
856 : : #endif
857 : :
858 : : #if defined(CONFIG_SPARSEMEM)
859 : : void memblocks_present(void);
860 : : #else
861 : : static inline void memblocks_present(void) {}
862 : : #endif
863 : :
864 : : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
865 : : int local_memory_node(int node_id);
866 : : #else
867 : : static inline int local_memory_node(int node_id) { return node_id; }
868 : : #endif
869 : :
870 : : /*
871 : : * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
872 : : */
873 : : #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
874 : :
875 : : /*
876 : : * Returns true if a zone has pages managed by the buddy allocator.
877 : : * All the reclaim decisions have to use this function rather than
878 : : * populated_zone(). If the whole zone is reserved then we can easily
879 : : * end up with populated_zone() && !managed_zone().
880 : : */
881 : : static inline bool managed_zone(struct zone *zone)
882 : : {
883 : : return zone_managed_pages(zone);
884 : : }
885 : :
886 : : /* Returns true if a zone has memory */
887 : : static inline bool populated_zone(struct zone *zone)
888 : : {
889 : 1441999 : return zone->present_pages;
890 : : }
891 : :
892 : : #ifdef CONFIG_NUMA
893 : : static inline int zone_to_nid(struct zone *zone)
894 : : {
895 : : return zone->node;
896 : : }
897 : :
898 : : static inline void zone_set_nid(struct zone *zone, int nid)
899 : : {
900 : : zone->node = nid;
901 : : }
902 : : #else
903 : : static inline int zone_to_nid(struct zone *zone)
904 : : {
905 : : return 0;
906 : : }
907 : :
908 : : static inline void zone_set_nid(struct zone *zone, int nid) {}
909 : : #endif
910 : :
911 : : extern int movable_zone;
912 : :
913 : : #ifdef CONFIG_HIGHMEM
914 : : static inline int zone_movable_is_highmem(void)
915 : : {
916 : : #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
917 : : return movable_zone == ZONE_HIGHMEM;
918 : : #else
919 : : return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
920 : : #endif
921 : : }
922 : : #endif
923 : :
924 : : static inline int is_highmem_idx(enum zone_type idx)
925 : : {
926 : : #ifdef CONFIG_HIGHMEM
927 : : return (idx == ZONE_HIGHMEM ||
928 : : (idx == ZONE_MOVABLE && zone_movable_is_highmem()));
929 : : #else
930 : : return 0;
931 : : #endif
932 : : }
933 : :
934 : : /**
935 : : * is_highmem - helper function to quickly check if a struct zone is a
936 : : * highmem zone or not. This is an attempt to keep references
937 : : * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
938 : : * @zone - pointer to struct zone variable
939 : : */
940 : : static inline int is_highmem(struct zone *zone)
941 : : {
942 : : #ifdef CONFIG_HIGHMEM
943 : : return is_highmem_idx(zone_idx(zone));
944 : : #else
945 : : return 0;
946 : : #endif
947 : : }
948 : :
949 : : /* These two functions are used to setup the per zone pages min values */
950 : : struct ctl_table;
951 : : int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
952 : : void __user *, size_t *, loff_t *);
953 : : int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
954 : : void __user *, size_t *, loff_t *);
955 : : int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
956 : : void __user *, size_t *, loff_t *);
957 : : extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
958 : : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
959 : : void __user *, size_t *, loff_t *);
960 : : int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
961 : : void __user *, size_t *, loff_t *);
962 : : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
963 : : void __user *, size_t *, loff_t *);
964 : : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
965 : : void __user *, size_t *, loff_t *);
966 : :
967 : : extern int numa_zonelist_order_handler(struct ctl_table *, int,
968 : : void __user *, size_t *, loff_t *);
969 : : extern char numa_zonelist_order[];
970 : : #define NUMA_ZONELIST_ORDER_LEN 16
971 : :
972 : : #ifndef CONFIG_NEED_MULTIPLE_NODES
973 : :
974 : : extern struct pglist_data contig_page_data;
975 : : #define NODE_DATA(nid) (&contig_page_data)
976 : : #define NODE_MEM_MAP(nid) mem_map
977 : :
978 : : #else /* CONFIG_NEED_MULTIPLE_NODES */
979 : :
980 : : #include <asm/mmzone.h>
981 : :
982 : : #endif /* !CONFIG_NEED_MULTIPLE_NODES */
983 : :
984 : : extern struct pglist_data *first_online_pgdat(void);
985 : : extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
986 : : extern struct zone *next_zone(struct zone *zone);
987 : :
988 : : /**
989 : : * for_each_online_pgdat - helper macro to iterate over all online nodes
990 : : * @pgdat - pointer to a pg_data_t variable
991 : : */
992 : : #define for_each_online_pgdat(pgdat) \
993 : : for (pgdat = first_online_pgdat(); \
994 : : pgdat; \
995 : : pgdat = next_online_pgdat(pgdat))
996 : : /**
997 : : * for_each_zone - helper macro to iterate over all memory zones
998 : : * @zone - pointer to struct zone variable
999 : : *
1000 : : * The user only needs to declare the zone variable, for_each_zone
1001 : : * fills it in.
1002 : : */
1003 : : #define for_each_zone(zone) \
1004 : : for (zone = (first_online_pgdat())->node_zones; \
1005 : : zone; \
1006 : : zone = next_zone(zone))
1007 : :
1008 : : #define for_each_populated_zone(zone) \
1009 : : for (zone = (first_online_pgdat())->node_zones; \
1010 : : zone; \
1011 : : zone = next_zone(zone)) \
1012 : : if (!populated_zone(zone)) \
1013 : : ; /* do nothing */ \
1014 : : else
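/*
 * Illustrative sketch (not part of the original header): summing present
 * pages over every populated zone with the iterator defined above. The
 * function name is hypothetical.
 */
static inline unsigned long total_present_pages_sketch(void)
{
	struct zone *zone;
	unsigned long nr = 0;

	for_each_populated_zone(zone)
		nr += zone->present_pages;

	return nr;
}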
1015 : :
1016 : : static inline struct zone *zonelist_zone(struct zoneref *zoneref)
1017 : : {
1018 : 2484 : return zoneref->zone;
1019 : : }
1020 : :
1021 : 38642063 : static inline int zonelist_zone_idx(struct zoneref *zoneref)
1022 : : {
1023 : 77292723 : return zoneref->zone_idx;
1024 : : }
1025 : :
1026 : : static inline int zonelist_node_idx(struct zoneref *zoneref)
1027 : : {
1028 : : return zone_to_nid(zoneref->zone);
1029 : : }
1030 : :
1031 : : struct zoneref *__next_zones_zonelist(struct zoneref *z,
1032 : : enum zone_type highest_zoneidx,
1033 : : nodemask_t *nodes);
1034 : :
1035 : : /**
1036 : : * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
1037 : : * @z - The cursor used as a starting point for the search
1038 : : * @highest_zoneidx - The zone index of the highest zone to return
1039 : : * @nodes - An optional nodemask to filter the zonelist with
1040 : : *
1041 : : * This function returns the next zone at or below a given zone index that is
1042 : : * within the allowed nodemask using a cursor as the starting point for the
1043 : : * search. The zoneref returned is a cursor that represents the current zone
1044 : : * being examined. It should be advanced by one before calling
1045 : : * next_zones_zonelist again.
1046 : : */
1047 : : static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
1048 : : enum zone_type highest_zoneidx,
1049 : : nodemask_t *nodes)
1050 : : {
1051 [ - + # # # # # # # # - + # # # # # # + + - + ]: 38674095 : if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
1052 : : return z;
1053 : 24345 : return __next_zones_zonelist(z, highest_zoneidx, nodes);
1054 : : }
1055 : :
1056 : : /**
1057 : : * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
1058 : : * @zonelist - The zonelist to search for a suitable zone
1059 : : * @highest_zoneidx - The zone index of the highest zone to return
1060 : : * @nodes - An optional nodemask to filter the zonelist with
1061 : : * @return - Zoneref pointer for the first suitable zone found (see below)
1062 : : *
1063 : : * This function returns the first zone at or below a given zone index that is
1064 : : * within the allowed nodemask. The zoneref returned is a cursor that can be
1065 : : * used to iterate the zonelist with next_zones_zonelist by advancing it by
1066 : : * one before calling.
1067 : : *
1068 : : * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
1069 : : * never NULL). This may happen either genuinely, or due to concurrent nodemask
1070 : : * never NULL). This may happen either genuinely, or due to a concurrent
1071 : : * nodemask update caused by cpuset modification.
1072 : 38667257 : static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
1073 : : enum zone_type highest_zoneidx,
1074 : : nodemask_t *nodes)
1075 : : {
1076 : 77305623 : return next_zones_zonelist(zonelist->_zonerefs,
1077 : : highest_zoneidx, nodes);
1078 : : }
1079 : :
1080 : : /**
1081 : : * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
1082 : : * @zone - The current zone in the iterator
1083 : : * @z - The current pointer within zonelist->zones being iterated
1084 : : * @zlist - The zonelist being iterated
1085 : : * @highidx - The zone index of the highest zone to return
1086 : : * @nodemask - Nodemask allowed by the allocator
1087 : : *
1088 : : * This iterator iterates through all zones at or below a given zone index and
1089 : : * within a given nodemask
1090 : : */
1091 : : #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
1092 : : for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
1093 : : zone; \
1094 : : z = next_zones_zonelist(++z, highidx, nodemask), \
1095 : : zone = zonelist_zone(z))
1096 : :
1097 : : #define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
1098 : : for (zone = z->zone; \
1099 : : zone; \
1100 : : z = next_zones_zonelist(++z, highidx, nodemask), \
1101 : : zone = zonelist_zone(z))
1102 : :
1103 : :
1104 : : /**
1105 : : * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
1106 : : * @zone - The current zone in the iterator
1107 : : * @z - The current pointer within zonelist->zones being iterated
1108 : : * @zlist - The zonelist being iterated
1109 : : * @highidx - The zone index of the highest zone to return
1110 : : *
1111 : : * This iterator iterates through all zones at or below a given zone index.
1112 : : */
1113 : : #define for_each_zone_zonelist(zone, z, zlist, highidx) \
1114 : : for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
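/*
 * Illustrative sketch (not part of the original header): walking a
 * zonelist with the iterator above. A real caller would derive highidx
 * from the gfp mask via gfp_zone(); here it is simply a parameter.
 */
static inline unsigned long zonelist_managed_pages_sketch(struct zonelist *zlist,
							  enum zone_type highidx)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr = 0;

	for_each_zone_zonelist(zone, z, zlist, highidx)
		nr += zone_managed_pages(zone);

	return nr;
}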
1115 : :
1116 : : #ifdef CONFIG_SPARSEMEM
1117 : : #include <asm/sparsemem.h>
1118 : : #endif
1119 : :
1120 : : #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
1121 : : !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1122 : : static inline unsigned long early_pfn_to_nid(unsigned long pfn)
1123 : : {
1124 : : BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
1125 : : return 0;
1126 : : }
1127 : : #endif
1128 : :
1129 : : #ifdef CONFIG_FLATMEM
1130 : : #define pfn_to_nid(pfn) (0)
1131 : : #endif
1132 : :
1133 : : #ifdef CONFIG_SPARSEMEM
1134 : :
1135 : : /*
1136 : : * SECTION_SHIFT #bits space required to store a section #
1137 : : *
1138 : : * PA_SECTION_SHIFT physical address to/from section number
1139 : : * PFN_SECTION_SHIFT pfn to/from section number
1140 : : */
1141 : : #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
1142 : : #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
1143 : :
1144 : : #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
1145 : :
1146 : : #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
1147 : : #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
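/*
 * Worked example (illustrative): on x86_64, SECTION_SIZE_BITS is 27 and
 * PAGE_SHIFT is 12, so each section covers 128 MiB, PFN_SECTION_SHIFT is
 * 15 and PAGES_PER_SECTION is 1 << 15 = 32768 pages.
 */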
1148 : :
1149 : : #define SECTION_BLOCKFLAGS_BITS \
1150 : : ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
1151 : :
1152 : : #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
1153 : : #error Allocator MAX_ORDER exceeds SECTION_SIZE
1154 : : #endif
1155 : :
1156 : : static inline unsigned long pfn_to_section_nr(unsigned long pfn)
1157 : : {
1158 : : return pfn >> PFN_SECTION_SHIFT;
1159 : : }
1160 : : static inline unsigned long section_nr_to_pfn(unsigned long sec)
1161 : : {
1162 : : return sec << PFN_SECTION_SHIFT;
1163 : : }
1164 : :
1165 : : #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
1166 : : #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1167 : :
1168 : : #define SUBSECTION_SHIFT 21
1169 : :
1170 : : #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
1171 : : #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
1172 : : #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))
1173 : :
1174 : : #if SUBSECTION_SHIFT > SECTION_SIZE_BITS
1175 : : #error Subsection size exceeds section size
1176 : : #else
1177 : : #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
1178 : : #endif
1179 : :
1180 : : #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
1181 : : #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
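/*
 * Worked example (illustrative): SUBSECTION_SHIFT of 21 makes each
 * subsection 2 MiB; with 4 KiB pages PFN_SUBSECTION_SHIFT is 9 and
 * PAGES_PER_SUBSECTION is 512. With 128 MiB sections (SECTION_SIZE_BITS
 * of 27) there are SUBSECTIONS_PER_SECTION = 1 << 6 = 64 subsections
 * tracked by each subsection_map.
 */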
1182 : :
1183 : : struct mem_section_usage {
1184 : : DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
1185 : : /* See declaration of similar field in struct zone */
1186 : : unsigned long pageblock_flags[0];
1187 : : };
1188 : :
1189 : : void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
1190 : :
1191 : : struct page;
1192 : : struct page_ext;
1193 : : struct mem_section {
1194 : : /*
1195 : : * This is, logically, a pointer to an array of struct
1196 : : * pages. However, it is stored with some other magic.
1197 : : * (see sparse.c::sparse_init_one_section())
1198 : : *
1199 : : * Additionally during early boot we encode node id of
1200 : : * the location of the section here to guide allocation.
1201 : : * (see sparse.c::memory_present())
1202 : : *
1203 : : * Making it a UL at least makes someone do a cast
1204 : : * before using it wrong.
1205 : : */
1206 : : unsigned long section_mem_map;
1207 : :
1208 : : struct mem_section_usage *usage;
1209 : : #ifdef CONFIG_PAGE_EXTENSION
1210 : : /*
1211 : : * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
1212 : : * section. (see page_ext.h about this.)
1213 : : */
1214 : : struct page_ext *page_ext;
1215 : : unsigned long pad;
1216 : : #endif
1217 : : /*
1218 : : * WARNING: mem_section must be a power-of-2 in size for the
1219 : : * calculation and use of SECTION_ROOT_MASK to make sense.
1220 : : */
1221 : : };
1222 : :
1223 : : #ifdef CONFIG_SPARSEMEM_EXTREME
1224 : : #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
1225 : : #else
1226 : : #define SECTIONS_PER_ROOT 1
1227 : : #endif
1228 : :
1229 : : #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
1230 : : #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
1231 : : #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
1232 : :
1233 : : #ifdef CONFIG_SPARSEMEM_EXTREME
1234 : : extern struct mem_section **mem_section;
1235 : : #else
1236 : : extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
1237 : : #endif
1238 : :
1239 : : static inline unsigned long *section_to_usemap(struct mem_section *ms)
1240 : : {
1241 : : return ms->usage->pageblock_flags;
1242 : : }
1243 : :
1244 : : static inline struct mem_section *__nr_to_section(unsigned long nr)
1245 : : {
1246 : : #ifdef CONFIG_SPARSEMEM_EXTREME
1247 : : if (!mem_section)
1248 : : return NULL;
1249 : : #endif
1250 : : if (!mem_section[SECTION_NR_TO_ROOT(nr)])
1251 : : return NULL;
1252 : : return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
1253 : : }
1254 : : extern unsigned long __section_nr(struct mem_section *ms);
1255 : : extern size_t mem_section_usage_size(void);
1256 : :
1257 : : /*
1258 : : * We use the lower bits of the mem_map pointer to store
1259 : : * a little bit of information. The pointer is calculated
1260 : : * as mem_map - section_nr_to_pfn(pnum). The result is
1261 : : * aligned to the minimum alignment of the two values:
1262 : : * 1. All mem_map arrays are page-aligned.
1263 : : * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
1264 : : * lowest bits. PFN_SECTION_SHIFT is arch-specific
1265 : : * (equal to SECTION_SIZE_BITS - PAGE_SHIFT), and the
1266 : : * worst combination is powerpc with 256k pages,
1267 : : * which results in PFN_SECTION_SHIFT equal to 6.
1268 : : * To sum it up, at least 6 bits are available.
1269 : : */
1270 : : #define SECTION_MARKED_PRESENT (1UL<<0)
1271 : : #define SECTION_HAS_MEM_MAP (1UL<<1)
1272 : : #define SECTION_IS_ONLINE (1UL<<2)
1273 : : #define SECTION_IS_EARLY (1UL<<3)
1274 : : #define SECTION_MAP_LAST_BIT (1UL<<4)
1275 : : #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
1276 : : #define SECTION_NID_SHIFT 3
1277 : :
1278 : : static inline struct page *__section_mem_map_addr(struct mem_section *section)
1279 : : {
1280 : : unsigned long map = section->section_mem_map;
1281 : : map &= SECTION_MAP_MASK;
1282 : : return (struct page *)map;
1283 : : }
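/*
 * Illustrative sketch (not part of the original header): because
 * section_mem_map holds mem_map - section_nr_to_pfn(pnum) in its upper
 * bits, classic (non-vmemmap) SPARSEMEM can translate a pfn by adding it
 * back, in the spirit of __pfn_to_page() in asm-generic/memory_model.h.
 */
static inline struct page *sparse_pfn_to_page_sketch(unsigned long pfn)
{
	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));

	return __section_mem_map_addr(ms) + pfn;
}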
1284 : :
1285 : : static inline int present_section(struct mem_section *section)
1286 : : {
1287 : : return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
1288 : : }
1289 : :
1290 : : static inline int present_section_nr(unsigned long nr)
1291 : : {
1292 : : return present_section(__nr_to_section(nr));
1293 : : }
1294 : :
1295 : : static inline int valid_section(struct mem_section *section)
1296 : : {
1297 : : return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
1298 : : }
1299 : :
1300 : : static inline int early_section(struct mem_section *section)
1301 : : {
1302 : : return (section && (section->section_mem_map & SECTION_IS_EARLY));
1303 : : }
1304 : :
1305 : : static inline int valid_section_nr(unsigned long nr)
1306 : : {
1307 : : return valid_section(__nr_to_section(nr));
1308 : : }
1309 : :
1310 : : static inline int online_section(struct mem_section *section)
1311 : : {
1312 : : return (section && (section->section_mem_map & SECTION_IS_ONLINE));
1313 : : }
1314 : :
1315 : : static inline int online_section_nr(unsigned long nr)
1316 : : {
1317 : : return online_section(__nr_to_section(nr));
1318 : : }
1319 : :
1320 : : #ifdef CONFIG_MEMORY_HOTPLUG
1321 : : void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
1322 : : #ifdef CONFIG_MEMORY_HOTREMOVE
1323 : : void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
1324 : : #endif
1325 : : #endif
1326 : :
1327 : : static inline struct mem_section *__pfn_to_section(unsigned long pfn)
1328 : : {
1329 : : return __nr_to_section(pfn_to_section_nr(pfn));
1330 : : }
1331 : :
1332 : : extern unsigned long __highest_present_section_nr;
1333 : :
1334 : : static inline int subsection_map_index(unsigned long pfn)
1335 : : {
1336 : : return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
1337 : : }
1338 : :
1339 : : #ifdef CONFIG_SPARSEMEM_VMEMMAP
1340 : : static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
1341 : : {
1342 : : int idx = subsection_map_index(pfn);
1343 : :
1344 : : return test_bit(idx, ms->usage->subsection_map);
1345 : : }
1346 : : #else
1347 : : static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
1348 : : {
1349 : : return 1;
1350 : : }
1351 : : #endif
1352 : :
1353 : : #ifndef CONFIG_HAVE_ARCH_PFN_VALID
1354 : : static inline int pfn_valid(unsigned long pfn)
1355 : : {
1356 : : struct mem_section *ms;
1357 : :
1358 : : if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1359 : : return 0;
1360 : : ms = __nr_to_section(pfn_to_section_nr(pfn));
1361 : : if (!valid_section(ms))
1362 : : return 0;
1363 : : /*
1364 : : * Traditionally early sections always returned pfn_valid() for
1365 : : * the entire section-sized span.
1366 : : */
1367 : : return early_section(ms) || pfn_section_valid(ms, pfn);
1368 : : }
1369 : : #endif
1370 : :
1371 : : static inline int pfn_present(unsigned long pfn)
1372 : : {
1373 : : if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1374 : : return 0;
1375 : : return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
1376 : : }
1377 : :
1378 : : /*
1379 : : * These are _only_ used during initialisation, therefore they
1380 : : * can use __initdata ... They could have names to indicate
1381 : : * this restriction.
1382 : : */
1383 : : #ifdef CONFIG_NUMA
1384 : : #define pfn_to_nid(pfn) \
1385 : : ({ \
1386 : : unsigned long __pfn_to_nid_pfn = (pfn); \
1387 : : page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
1388 : : })
1389 : : #else
1390 : : #define pfn_to_nid(pfn) (0)
1391 : : #endif
1392 : :
1393 : : #define early_pfn_valid(pfn) pfn_valid(pfn)
1394 : : void sparse_init(void);
1395 : : #else
1396 : : #define sparse_init() do {} while (0)
1397 : : #define sparse_index_init(_sec, _nid) do {} while (0)
1398 : : #define pfn_present pfn_valid
1399 : : #define subsection_map_init(_pfn, _nr_pages) do {} while (0)
1400 : : #endif /* CONFIG_SPARSEMEM */
1401 : :
1402 : : /*
1403 : : * During memory init memblocks map pfns to nids. The search is expensive and
1404 : : * this caches recent lookups. The implementation of __early_pfn_to_nid
1405 : : * may treat start/end as pfns or sections.
1406 : : */
1407 : : struct mminit_pfnnid_cache {
1408 : : unsigned long last_start;
1409 : : unsigned long last_end;
1410 : : int last_nid;
1411 : : };
1412 : :
1413 : : #ifndef early_pfn_valid
1414 : : #define early_pfn_valid(pfn) (1)
1415 : : #endif
1416 : :
1417 : : void memory_present(int nid, unsigned long start, unsigned long end);
1418 : :
1419 : : /*
1420 : : * If it is possible to have holes within a MAX_ORDER_NR_PAGES block, then we
1421 : : * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
1422 : : * pfn_valid_within() should be used in this case; we optimise this away
1423 : : * when we have no holes within a MAX_ORDER_NR_PAGES block.
1424 : : */
1425 : : #ifdef CONFIG_HOLES_IN_ZONE
1426 : : #define pfn_valid_within(pfn) pfn_valid(pfn)
1427 : : #else
1428 : : #define pfn_valid_within(pfn) (1)
1429 : : #endif
1430 : :
1431 : : #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
1432 : : /*
1433 : : * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
1434 : : * associated with it or not. This means that a struct page exists for this
1435 : : * pfn. The caller cannot assume the page is fully initialized in general.
1436 : : * Hotplugable pages might not have been onlined yet. pfn_to_online_page()
1437 : : * will ensure the struct page is fully online and initialized. Special pages
1438 : : * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly.
1439 : : *
1440 : : * In FLATMEM, it is expected that holes always have valid memmap as long as
1441 : : * there are valid PFNs on either side of the hole. In SPARSEMEM, it is assumed
1442 : : * that a valid section has a memmap for the entire section.
1443 : : *
1444 : : * However, ARM, and maybe other embedded architectures in the future,
1445 : : * free the memmap backing holes to save memory on the assumption the memmap is
1446 : : * never used. The page_zone linkages are then broken even though pfn_valid()
1447 : : * returns true. A walker of the full memmap must then do this additional
1448 : : * check to ensure the memmap they are looking at is sane by making sure
1449 : : * the zone and PFN linkages are still valid. This is expensive, but walkers
1450 : : * of the full memmap are extremely rare.
1451 : : */
1452 : : bool memmap_valid_within(unsigned long pfn,
1453 : : struct page *page, struct zone *zone);
1454 : : #else
1455 : : static inline bool memmap_valid_within(unsigned long pfn,
1456 : : struct page *page, struct zone *zone)
1457 : : {
1458 : : return true;
1459 : : }
1460 : : #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
1461 : :
1462 : : #endif /* !__GENERATING_BOUNDS_H */
1463 : : #endif /* !__ASSEMBLY__ */
1464 : : #endif /* _LINUX_MMZONE_H */