/* dump_pagetables.c — diff view between commits c5cfae12fdd5 and 2ae27137b2db (two versions interleaved below) */
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Debug helper to dump the current kernel pagetables of the system
4 * so that we can see what the various memory ranges are set to.
5 *
6 * (C) Copyright 2008 Intel Corporation
7 *
8 * Author: Arjan van de Ven <arjan@linux.intel.com>
9 */
10
11#include <linux/debugfs.h>
12#include <linux/kasan.h>
13#include <linux/mm.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/seq_file.h>
17#include <linux/highmem.h>
18#include <linux/pci.h>
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Debug helper to dump the current kernel pagetables of the system
4 * so that we can see what the various memory ranges are set to.
5 *
6 * (C) Copyright 2008 Intel Corporation
7 *
8 * Author: Arjan van de Ven <arjan@linux.intel.com>
9 */
10
11#include <linux/debugfs.h>
12#include <linux/kasan.h>
13#include <linux/mm.h>
14#include <linux/init.h>
15#include <linux/sched.h>
16#include <linux/seq_file.h>
17#include <linux/highmem.h>
18#include <linux/pci.h>
19#include <linux/ptdump.h>
19
20#include <asm/e820/types.h>
21#include <asm/pgtable.h>
22
23/*
24 * The dumper groups pagetable entries of the same type into one, and for
25 * that it needs to keep some state when walking, and flush this state
26 * when a "break" in the continuity is found.
27 */
28struct pg_state {
20
21#include <asm/e820/types.h>
22#include <asm/pgtable.h>
23
24/*
25 * The dumper groups pagetable entries of the same type into one, and for
26 * that it needs to keep some state when walking, and flush this state
27 * when a "break" in the continuity is found.
28 */
29struct pg_state {
30 struct ptdump_state ptdump;
29 int level;
31 int level;
30 pgprot_t current_prot;
32 pgprotval_t current_prot;
31 pgprotval_t effective_prot;
33 pgprotval_t effective_prot;
34 pgprotval_t prot_levels[5];
32 unsigned long start_address;
35 unsigned long start_address;
33 unsigned long current_address;
34 const struct addr_marker *marker;
35 unsigned long lines;
36 bool to_dmesg;
37 bool check_wx;
38 unsigned long wx_pages;
39 struct seq_file *seq;
40};
41

--- 128 unchanged lines hidden (view full) ---

170 else \
171 if (m) \
172 seq_printf(m, fmt, ##args); \
173})
174
175/*
176 * Print a readable form of a pgprot_t to the seq_file
177 */
36 const struct addr_marker *marker;
37 unsigned long lines;
38 bool to_dmesg;
39 bool check_wx;
40 unsigned long wx_pages;
41 struct seq_file *seq;
42};
43

--- 128 unchanged lines hidden (view full) ---

172 else \
173 if (m) \
174 seq_printf(m, fmt, ##args); \
175})
176
177/*
178 * Print a readable form of a pgprot_t to the seq_file
179 */
178static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
180static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
179{
181{
180 pgprotval_t pr = pgprot_val(prot);
181 static const char * const level_name[] =
182 { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
183
184 if (!(pr & _PAGE_PRESENT)) {
185 /* Not present */
186 pt_dump_cont_printf(m, dmsg, " ");
187 } else {
188 if (pr & _PAGE_USER)

--- 30 unchanged lines hidden (view full) ---

219 if (pr & _PAGE_NX)
220 pt_dump_cont_printf(m, dmsg, "NX ");
221 else
222 pt_dump_cont_printf(m, dmsg, "x ");
223 }
224 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
225}
226
182 static const char * const level_name[] =
183 { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
184
185 if (!(pr & _PAGE_PRESENT)) {
186 /* Not present */
187 pt_dump_cont_printf(m, dmsg, " ");
188 } else {
189 if (pr & _PAGE_USER)

--- 30 unchanged lines hidden (view full) ---

220 if (pr & _PAGE_NX)
221 pt_dump_cont_printf(m, dmsg, "NX ");
222 else
223 pt_dump_cont_printf(m, dmsg, "x ");
224 }
225 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
226}
227
227/*
228 * On 64 bits, sign-extend the 48 bit address to 64 bit
229 */
230static unsigned long normalize_addr(unsigned long u)
228static void note_wx(struct pg_state *st, unsigned long addr)
231{
229{
232 int shift;
233 if (!IS_ENABLED(CONFIG_X86_64))
234 return u;
235
236 shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
237 return (signed long)(u << shift) >> shift;
238}
239
240static void note_wx(struct pg_state *st)
241{
242 unsigned long npages;
243
230 unsigned long npages;
231
244 npages = (st->current_address - st->start_address) / PAGE_SIZE;
232 npages = (addr - st->start_address) / PAGE_SIZE;
245
246#ifdef CONFIG_PCI_BIOS
247 /*
248 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
249 * Inform about it, but avoid the warning.
250 */
251 if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
233
234#ifdef CONFIG_PCI_BIOS
235 /*
236 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
237 * Inform about it, but avoid the warning.
238 */
239 if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
252 st->current_address <= PAGE_OFFSET + BIOS_END) {
240 addr <= PAGE_OFFSET + BIOS_END) {
253 pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
254 return;
255 }
256#endif
257 /* Account the WX pages */
258 st->wx_pages += npages;
259 WARN_ONCE(__supported_pte_mask & _PAGE_NX,
260 "x86/mm: Found insecure W+X mapping at address %pS\n",
261 (void *)st->start_address);
262}
263
241 pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
242 return;
243 }
244#endif
245 /* Account the WX pages */
246 st->wx_pages += npages;
247 WARN_ONCE(__supported_pte_mask & _PAGE_NX,
248 "x86/mm: Found insecure W+X mapping at address %pS\n",
249 (void *)st->start_address);
250}
251
252static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
253{
254 return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
255 ((prot1 | prot2) & _PAGE_NX);
256}
257
264/*
265 * This function gets called on a break in a continuous series
266 * of PTE entries; the next one is different so we need to
267 * print what we collected so far.
268 */
258/*
259 * This function gets called on a break in a continuous series
260 * of PTE entries; the next one is different so we need to
261 * print what we collected so far.
262 */
269static void note_page(struct pg_state *st, pgprot_t new_prot,
270 pgprotval_t new_eff, int level)
263static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
264 unsigned long val)
271{
265{
272 pgprotval_t prot, cur, eff;
266 struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
267 pgprotval_t new_prot, new_eff;
268 pgprotval_t cur, eff;
273 static const char units[] = "BKMGTPE";
274 struct seq_file *m = st->seq;
275
269 static const char units[] = "BKMGTPE";
270 struct seq_file *m = st->seq;
271
272 new_prot = val & PTE_FLAGS_MASK;
273
274 if (level > 1) {
275 new_eff = effective_prot(st->prot_levels[level - 2],
276 new_prot);
277 } else {
278 new_eff = new_prot;
279 }
280
281 if (level > 0)
282 st->prot_levels[level - 1] = new_eff;
283
276 /*
277 * If we have a "break" in the series, we need to flush the state that
278 * we have now. "break" is either changing perms, levels or
279 * address space marker.
280 */
284 /*
285 * If we have a "break" in the series, we need to flush the state that
286 * we have now. "break" is either changing perms, levels or
287 * address space marker.
288 */
281 prot = pgprot_val(new_prot);
282 cur = pgprot_val(st->current_prot);
289 cur = st->current_prot;
283 eff = st->effective_prot;
284
285 if (!st->level) {
286 /* First entry */
287 st->current_prot = new_prot;
288 st->effective_prot = new_eff;
289 st->level = level;
290 st->marker = address_markers;
291 st->lines = 0;
292 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
293 st->marker->name);
290 eff = st->effective_prot;
291
292 if (!st->level) {
293 /* First entry */
294 st->current_prot = new_prot;
295 st->effective_prot = new_eff;
296 st->level = level;
297 st->marker = address_markers;
298 st->lines = 0;
299 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
300 st->marker->name);
294 } else if (prot != cur || new_eff != eff || level != st->level ||
295 st->current_address >= st->marker[1].start_address) {
301 } else if (new_prot != cur || new_eff != eff || level != st->level ||
302 addr >= st->marker[1].start_address) {
296 const char *unit = units;
297 unsigned long delta;
298 int width = sizeof(unsigned long) * 2;
299
300 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
303 const char *unit = units;
304 unsigned long delta;
305 int width = sizeof(unsigned long) * 2;
306
307 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
301 note_wx(st);
308 note_wx(st, addr);
302
303 /*
304 * Now print the actual finished series
305 */
306 if (!st->marker->max_lines ||
307 st->lines < st->marker->max_lines) {
308 pt_dump_seq_printf(m, st->to_dmesg,
309 "0x%0*lx-0x%0*lx ",
310 width, st->start_address,
309
310 /*
311 * Now print the actual finished series
312 */
313 if (!st->marker->max_lines ||
314 st->lines < st->marker->max_lines) {
315 pt_dump_seq_printf(m, st->to_dmesg,
316 "0x%0*lx-0x%0*lx ",
317 width, st->start_address,
311 width, st->current_address);
318 width, addr);
312
319
313 delta = st->current_address - st->start_address;
320 delta = addr - st->start_address;
314 while (!(delta & 1023) && unit[1]) {
315 delta >>= 10;
316 unit++;
317 }
318 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
319 delta, *unit);
320 printk_prot(m, st->current_prot, st->level,
321 st->to_dmesg);
322 }
323 st->lines++;
324
325 /*
326 * We print markers for special areas of address space,
327 * such as the start of vmalloc space etc.
328 * This helps in the interpretation.
329 */
321 while (!(delta & 1023) && unit[1]) {
322 delta >>= 10;
323 unit++;
324 }
325 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
326 delta, *unit);
327 printk_prot(m, st->current_prot, st->level,
328 st->to_dmesg);
329 }
330 st->lines++;
331
332 /*
333 * We print markers for special areas of address space,
334 * such as the start of vmalloc space etc.
335 * This helps in the interpretation.
336 */
330 if (st->current_address >= st->marker[1].start_address) {
337 if (addr >= st->marker[1].start_address) {
331 if (st->marker->max_lines &&
332 st->lines > st->marker->max_lines) {
333 unsigned long nskip =
334 st->lines - st->marker->max_lines;
335 pt_dump_seq_printf(m, st->to_dmesg,
336 "... %lu entr%s skipped ... \n",
337 nskip,
338 nskip == 1 ? "y" : "ies");
339 }
340 st->marker++;
341 st->lines = 0;
342 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
343 st->marker->name);
344 }
345
338 if (st->marker->max_lines &&
339 st->lines > st->marker->max_lines) {
340 unsigned long nskip =
341 st->lines - st->marker->max_lines;
342 pt_dump_seq_printf(m, st->to_dmesg,
343 "... %lu entr%s skipped ... \n",
344 nskip,
345 nskip == 1 ? "y" : "ies");
346 }
347 st->marker++;
348 st->lines = 0;
349 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
350 st->marker->name);
351 }
352
346 st->start_address = st->current_address;
353 st->start_address = addr;
347 st->current_prot = new_prot;
348 st->effective_prot = new_eff;
349 st->level = level;
350 }
351}
352
354 st->current_prot = new_prot;
355 st->effective_prot = new_eff;
356 st->level = level;
357 }
358}
359
353static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
354{
355 return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
356 ((prot1 | prot2) & _PAGE_NX);
357}
358
359static void walk_pte_level(struct pg_state *st, pmd_t addr, pgprotval_t eff_in,
360 unsigned long P)
361{
362 int i;
363 pte_t *pte;
364 pgprotval_t prot, eff;
365
366 for (i = 0; i < PTRS_PER_PTE; i++) {
367 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
368 pte = pte_offset_map(&addr, st->current_address);
369 prot = pte_flags(*pte);
370 eff = effective_prot(eff_in, prot);
371 note_page(st, __pgprot(prot), eff, 5);
372 pte_unmap(pte);
373 }
374}
375#ifdef CONFIG_KASAN
376
377/*
378 * This is an optimization for KASAN=y case. Since all kasan page tables
379 * eventually point to the kasan_early_shadow_page we could call note_page()
380 * right away without walking through lower level page tables. This saves
381 * us dozens of seconds (minutes for 5-level config) while checking for
382 * W+X mapping or reading kernel_page_tables debugfs file.
383 */
384static inline bool kasan_page_table(struct pg_state *st, void *pt)
385{
386 if (__pa(pt) == __pa(kasan_early_shadow_pmd) ||
387 (pgtable_l5_enabled() &&
388 __pa(pt) == __pa(kasan_early_shadow_p4d)) ||
389 __pa(pt) == __pa(kasan_early_shadow_pud)) {
390 pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]);
391 note_page(st, __pgprot(prot), 0, 5);
392 return true;
393 }
394 return false;
395}
396#else
397static inline bool kasan_page_table(struct pg_state *st, void *pt)
398{
399 return false;
400}
401#endif
402
#if PTRS_PER_PMD > 1

/*
 * Walk every PMD slot below one PUD entry. Large or non-present
 * entries are reported directly at level 4; otherwise we descend into
 * the PTE level unless the table is a shared KASAN shadow table.
 */
static void walk_pmd_level(struct pg_state *st, pud_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	pmd_t *pmd_start, *cur;
	pgprotval_t prot, eff;
	int idx;

	pmd_start = cur = (pmd_t *)pud_page_vaddr(addr);
	for (idx = 0; idx < PTRS_PER_PMD; idx++) {
		st->current_address = normalize_addr(P + idx * PMD_LEVEL_MULT);
		if (!pmd_none(*cur)) {
			prot = pmd_flags(*cur);
			eff = effective_prot(eff_in, prot);
			if (pmd_large(*cur) || !pmd_present(*cur)) {
				note_page(st, __pgprot(prot), eff, 4);
			} else if (!kasan_page_table(st, pmd_start)) {
				walk_pte_level(st, *cur, eff,
					       P + idx * PMD_LEVEL_MULT);
			}
		} else
			note_page(st, __pgprot(0), 0, 4);
		cur++;
	}
}

#else
/* Two-level configs: the PUD folds directly into the PTE walk. */
#define walk_pmd_level(s, a, e, p) walk_pte_level(s, __pmd(pud_val(a)), e, p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif
435
#if PTRS_PER_PUD > 1

/*
 * Walk every PUD slot below one P4D entry. Large or non-present
 * entries are reported directly at level 3; otherwise descend into the
 * PMD level unless the table is a shared KASAN shadow table.
 */
static void walk_pud_level(struct pg_state *st, p4d_t addr, pgprotval_t eff_in,
			   unsigned long P)
{
	pud_t *pud_start, *cur;
	pgprotval_t prot, eff;
	int idx;

	pud_start = cur = (pud_t *)p4d_page_vaddr(addr);

	for (idx = 0; idx < PTRS_PER_PUD; idx++) {
		st->current_address = normalize_addr(P + idx * PUD_LEVEL_MULT);
		if (!pud_none(*cur)) {
			prot = pud_flags(*cur);
			eff = effective_prot(eff_in, prot);
			if (pud_large(*cur) || !pud_present(*cur)) {
				note_page(st, __pgprot(prot), eff, 3);
			} else if (!kasan_page_table(st, pud_start)) {
				walk_pmd_level(st, *cur, eff,
					       P + idx * PUD_LEVEL_MULT);
			}
		} else
			note_page(st, __pgprot(0), 0, 3);

		cur++;
	}
}

#else
/* Three-level configs: the P4D folds directly into the PMD walk. */
#define walk_pud_level(s, a, e, p) walk_pmd_level(s, __pud(p4d_val(a)), e, p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif
470
471static void walk_p4d_level(struct pg_state *st, pgd_t addr, pgprotval_t eff_in,
472 unsigned long P)
473{
474 int i;
475 p4d_t *start, *p4d_start;
476 pgprotval_t prot, eff;
477
478 if (PTRS_PER_P4D == 1)
479 return walk_pud_level(st, __p4d(pgd_val(addr)), eff_in, P);
480
481 p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
482
483 for (i = 0; i < PTRS_PER_P4D; i++) {
484 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
485 if (!p4d_none(*start)) {
486 prot = p4d_flags(*start);
487 eff = effective_prot(eff_in, prot);
488 if (p4d_large(*start) || !p4d_present(*start)) {
489 note_page(st, __pgprot(prot), eff, 2);
490 } else if (!kasan_page_table(st, p4d_start)) {
491 walk_pud_level(st, *start, eff,
492 P + i * P4D_LEVEL_MULT);
493 }
494 } else
495 note_page(st, __pgprot(0), 0, 2);
496
497 start++;
498 }
499}
500
501#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
502#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
503
504static inline bool is_hypervisor_range(int idx)
505{
506#ifdef CONFIG_X86_64
507 /*
508 * A hole in the beginning of kernel address space reserved
509 * for a hypervisor.
510 */
511 return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
512 (idx < pgd_index(GUARD_HOLE_END_ADDR));
513#else
514 return false;
515#endif
516}
517
518static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
519 bool checkwx, bool dmesg)
520{
360static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
361 bool checkwx, bool dmesg)
362{
521 pgd_t *start = pgd;
522 pgprotval_t prot, eff;
523 int i;
524 struct pg_state st = {};
363 const struct ptdump_range ptdump_ranges[] = {
364#ifdef CONFIG_X86_64
525
365
526 st.to_dmesg = dmesg;
527 st.check_wx = checkwx;
528 st.seq = m;
529 if (checkwx)
530 st.wx_pages = 0;
366#define normalize_addr_shift (64 - (__VIRTUAL_MASK_SHIFT + 1))
367#define normalize_addr(u) ((signed long)((u) << normalize_addr_shift) >> \
368 normalize_addr_shift)
531
369
532 for (i = 0; i < PTRS_PER_PGD; i++) {
533 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
534 if (!pgd_none(*start) && !is_hypervisor_range(i)) {
535 prot = pgd_flags(*start);
536#ifdef CONFIG_X86_PAE
537 eff = _PAGE_USER | _PAGE_RW;
370 {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2},
371 {normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT / 2), ~0UL},
538#else
372#else
539 eff = prot;
373 {0, ~0UL},
540#endif
374#endif
541 if (pgd_large(*start) || !pgd_present(*start)) {
542 note_page(&st, __pgprot(prot), eff, 1);
543 } else {
544 walk_p4d_level(&st, *start, eff,
545 i * PGD_LEVEL_MULT);
546 }
547 } else
548 note_page(&st, __pgprot(0), 0, 1);
375 {0, 0}
376};
549
377
550 cond_resched();
551 start++;
552 }
378 struct pg_state st = {
379 .ptdump = {
380 .note_page = note_page,
381 .range = ptdump_ranges
382 },
383 .to_dmesg = dmesg,
384 .check_wx = checkwx,
385 .seq = m
386 };
553
387
554 /* Flush out the last page */
555 st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
556 note_page(&st, __pgprot(0), 0, 0);
388 struct mm_struct fake_mm = {
389 .pgd = pgd
390 };
391 init_rwsem(&fake_mm.mmap_sem);
392
393 ptdump_walk_pgd(&st.ptdump, &fake_mm);
394
557 if (!checkwx)
558 return;
559 if (st.wx_pages)
560 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
561 st.wx_pages);
562 else
563 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
564}

--- 71 unchanged lines hidden ---
395 if (!checkwx)
396 return;
397 if (st.wx_pages)
398 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
399 st.wx_pages);
400 else
401 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
402}

--- 71 unchanged lines hidden ---