// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
        if (depth == 3 && PTRS_PER_PMD == 1)
                depth = 2;
        if (depth == 2 && PTRS_PER_PUD == 1)
                depth = 1;
        if (depth == 1 && PTRS_PER_P4D == 1)
                depth = 0;
        return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                if (addr >= end - PAGE_SIZE)
                        break;
                addr += PAGE_SIZE;
                pte++;
        }
        return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        spinlock_t *ptl;

        if (walk->no_vma) {
                pte = pte_offset_map(pmd, addr);
                err = walk_pte_range_inner(pte, addr, end, walk);
                pte_unmap(pte);
        } else {
                pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
                err = walk_pte_range_inner(pte, addr, end, walk);
                pte_unmap_unlock(pte, ptl);
        }

        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(3);

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 * (an example handler is sketched after this function).
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pte_entry))
                        continue;

                if (walk->vma) {
                        split_huge_pmd(walk->vma, pmd, addr);
                        if (pmd_trans_unstable(pmd))
                                goto again;
                }

                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(2);

        pud = pud_offset(p4d, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                if (ops->pud_entry)
                        err = ops->pud_entry(pud, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pmd_entry || ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(1);

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->p4d_entry) {
                        err = ops->p4d_entry(p4d, addr, next, walk);
                        if (err)
                                break;
                }
                if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        if (walk->pgd)
                pgd = walk->pgd + pgd_index(addr);
        else
                pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, 0, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pgd_entry) {
                        err = ops->pgd_entry(pgd, addr, next, walk);
                        if (err)
                                break;
                }
                if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
                    ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask, sz);

                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, -1, walk);

                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk. An example
 * ->test_walk() callback is sketched after this function.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
         * range, so we don't walk over it as we do for normal vmas. However,
         * some callers are interested in handling hole ranges and they don't
         * want to just ignore any single address range. Such users certainly
         * define their ->pte_hole() callbacks, so let's delegate them to handle
         * vma(VM_PFNMAP).
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;

                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, -1, walk);
                return err ? err : 1;
        }
        return 0;
}
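
/*
 * Illustrative sketch, not part of mm/pagewalk.c: a ->test_walk() callback
 * following the convention documented above walk_page_test(). Returning 1
 * skips the vma, 0 walks it, and a negative errno would abort the whole walk.
 * The name and the skip-special-mappings policy are invented for the example.
 */
static int example_test_walk(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        /* Example policy: skip special mappings, walk everything else. */
        if (walk->vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                return 1;       /* skip this vma, keep walking the range */
        return 0;               /* walk this vma */
}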

static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (vma && ops->pre_vma) {
                err = ops->pre_vma(start, end, walk);
                if (err)
                        return err;
        }

        if (vma && is_vm_hugetlb_page(vma)) {
                if (ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        if (vma && ops->post_vma)
                ops->post_vma(walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as below:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 * (A hypothetical example caller is sketched after this function.)
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                    unsigned long end, const struct mm_walk_ops *ops,
                    void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops = ops,
                .mm = mm,
                .private = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk.vma || walk.ops->pte_hole)
                        err = __walk_page_range(start, next, &walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}
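
/*
 * Illustrative sketch, not part of mm/pagewalk.c: a hypothetical caller of
 * walk_page_range() matching the kernel-doc above. It counts present ptes in
 * a range of a target mm using a ->pte_entry() callback; the function and
 * callback names, and the use of walk->private as a counter, are invented for
 * the example. Note that, as walk_pmd_range() above shows, the walker splits
 * huge pmds to reach the pte level when only ->pte_entry() is supplied.
 */
static int example_count_pte(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;

        if (pte_present(*pte))
                (*nr_present)++;
        return 0;       /* keep walking until the end address */
}

static const struct mm_walk_ops example_count_ops = {
        .pte_entry = example_count_pte,
};

static unsigned long example_count_present(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        unsigned long nr_present = 0;

        down_read(&mm->mmap_sem);       /* walk_page_range() asserts this */
        walk_page_range(mm, start, end, &example_count_ops, &nr_present);
        up_read(&mm->mmap_sem);

        return nr_present;
}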

/*
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware. A sketch of such
 * a caller follows this function.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
                          unsigned long end, const struct mm_walk_ops *ops,
                          pgd_t *pgd,
                          void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = mm,
                .pgd = pgd,
                .private = private,
                .no_vma = true
        };

        if (start >= end || !walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        return __walk_page_range(start, end, &walk);
}
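
/*
 * Illustrative sketch, not part of mm/pagewalk.c: a hypothetical user of
 * walk_page_range_novma() walking the kernel's own page tables, roughly in
 * the spirit of the ptdump code. The names are invented for the example; it
 * counts leaf (huge) pmd mappings and reports holes with their depth.
 */
static int example_kernel_pmd_entry(pmd_t *pmd, unsigned long addr,
                                    unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_leaf = walk->private;

        if (pmd_leaf(*pmd))
                (*nr_leaf)++;
        return 0;
}

static int example_kernel_pte_hole(unsigned long addr, unsigned long next,
                                   int depth, struct mm_walk *walk)
{
        /* @depth is the level of the missing entry; 0 means pgd. */
        pr_debug("hole at %#lx-%#lx (depth %d)\n", addr, next, depth);
        return 0;
}

static const struct mm_walk_ops example_kernel_ops = {
        .pmd_entry = example_kernel_pmd_entry,
        .pte_hole = example_kernel_pte_hole,
};

static unsigned long example_walk_kernel_tables(unsigned long start,
                                                unsigned long end)
{
        unsigned long nr_leaf = 0;

        down_read(&init_mm.mmap_sem);   /* satisfies the lockdep assertion */
        walk_page_range_novma(&init_mm, start, end, &example_kernel_ops,
                              NULL /* NULL: use init_mm's pgd */, &nr_leaf);
        up_read(&init_mm.mmap_sem);

        return nr_leaf;
}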

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                  void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = vma->vm_mm,
                .vma = vma,
                .private = private,
        };
        int err;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 * (A hypothetical example caller is sketched after this function.)
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_sem is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_sem is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
                      pgoff_t nr, const struct mm_walk_ops *ops,
                      void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .private = private,
        };
        struct vm_area_struct *vma;
        pgoff_t vba, vea, cba, cea;
        unsigned long start_addr, end_addr;
        int err = 0;

        lockdep_assert_held(&mapping->i_mmap_rwsem);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
                                  first_index + nr - 1) {
                /* Clip to the vma */
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma);
                cba = first_index;
                cba = max(cba, vba);
                cea = first_index + nr;
                cea = min(cea, vea);

                start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
                end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
                if (start_addr >= end_addr)
                        continue;

                walk.vma = vma;
                walk.mm = vma->vm_mm;

                err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
                if (err > 0) {
                        err = 0;
                        break;
                } else if (err < 0)
                        break;

                err = __walk_page_range(start_addr, end_addr, &walk);
                if (err)
                        break;
        }

        return err;
}
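
/*
 * Illustrative sketch, not part of mm/pagewalk.c: a hypothetical caller of
 * walk_page_mapping() following the locking rule in the kernel-doc above
 * (i_mmap_rwsem held, mmap_sem not required). It counts writable ptes mapping
 * the given page offset range; the names are invented for the example.
 */
static int example_mapping_pte(pte_t *pte, unsigned long addr,
                               unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_writable = walk->private;

        if (pte_present(*pte) && pte_write(*pte))
                (*nr_writable)++;
        return 0;
}

static const struct mm_walk_ops example_mapping_ops = {
        .pte_entry = example_mapping_pte,
};

static unsigned long example_count_writable(struct address_space *mapping,
                                            pgoff_t first_index, pgoff_t nr)
{
        unsigned long nr_writable = 0;

        i_mmap_lock_read(mapping);      /* takes @mapping->i_mmap_rwsem */
        walk_page_mapping(mapping, first_index, nr, &example_mapping_ops,
                          &nr_writable);
        i_mmap_unlock_read(mapping);

        return nr_writable;
}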