dump_pagetables.c (old: c5cfae12fdd50809b16482c13a94fa6cf1e45b31) | dump_pagetables.c (new: 2ae27137b2db89365f623a7694786cf6d1acb6c7) |
---|---|
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Debug helper to dump the current kernel pagetables of the system 4 * so that we can see what the various memory ranges are set to. 5 * 6 * (C) Copyright 2008 Intel Corporation 7 * 8 * Author: Arjan van de Ven <arjan@linux.intel.com> 9 */ 10 11#include <linux/debugfs.h> 12#include <linux/kasan.h> 13#include <linux/mm.h> 14#include <linux/init.h> 15#include <linux/sched.h> 16#include <linux/seq_file.h> 17#include <linux/highmem.h> 18#include <linux/pci.h> | 1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Debug helper to dump the current kernel pagetables of the system 4 * so that we can see what the various memory ranges are set to. 5 * 6 * (C) Copyright 2008 Intel Corporation 7 * 8 * Author: Arjan van de Ven <arjan@linux.intel.com> 9 */ 10 11#include <linux/debugfs.h> 12#include <linux/kasan.h> 13#include <linux/mm.h> 14#include <linux/init.h> 15#include <linux/sched.h> 16#include <linux/seq_file.h> 17#include <linux/highmem.h> 18#include <linux/pci.h> |
| 19#include <linux/ptdump.h>
|
19 20#include <asm/e820/types.h> 21#include <asm/pgtable.h> 22 23/* 24 * The dumper groups pagetable entries of the same type into one, and for 25 * that it needs to keep some state when walking, and flush this state 26 * when a "break" in the continuity is found. 27 */ 28struct pg_state { | 20 21#include <asm/e820/types.h> 22#include <asm/pgtable.h> 23 24/* 25 * The dumper groups pagetable entries of the same type into one, and for 26 * that it needs to keep some state when walking, and flush this state 27 * when a "break" in the continuity is found. 28 */ 29struct pg_state { |
| 30 struct ptdump_state ptdump;
|
29 int level; | 31 int level; |
30 pgprot_t current_prot; | 32 pgprotval_t current_prot; |
31 pgprotval_t effective_prot; | 33 pgprotval_t effective_prot; |
| 34 pgprotval_t prot_levels[5];
|
32 unsigned long start_address; | 35 unsigned long start_address; |
33 unsigned long current_address; | |
34 const struct addr_marker *marker; 35 unsigned long lines; 36 bool to_dmesg; 37 bool check_wx; 38 unsigned long wx_pages; 39 struct seq_file *seq; 40}; 41 --- 128 unchanged lines hidden --- 170 else \ 171 if (m) \ 172 seq_printf(m, fmt, ##args); \ 173}) 174 175/* 176 * Print a readable form of a pgprot_t to the seq_file 177 */ | 36 const struct addr_marker *marker; 37 unsigned long lines; 38 bool to_dmesg; 39 bool check_wx; 40 unsigned long wx_pages; 41 struct seq_file *seq; 42}; 43 --- 128 unchanged lines hidden --- 172 else \ 173 if (m) \ 174 seq_printf(m, fmt, ##args); \ 175}) 176 177/* 178 * Print a readable form of a pgprot_t to the seq_file 179 */
178static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) | 180static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) |
179{ | 181{ |
180 pgprotval_t pr = pgprot_val(prot); | |
181 static const char * const level_name[] = 182 { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; 183 184 if (!(pr & _PAGE_PRESENT)) { 185 /* Not present */ 186 pt_dump_cont_printf(m, dmsg, " "); 187 } else { 188 if (pr & _PAGE_USER) --- 30 unchanged lines hidden --- 219 if (pr & _PAGE_NX) 220 pt_dump_cont_printf(m, dmsg, "NX "); 221 else 222 pt_dump_cont_printf(m, dmsg, "x "); 223 } 224 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); 225} 226 | 182 static const char * const level_name[] = 183 { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; 184 185 if (!(pr & _PAGE_PRESENT)) { 186 /* Not present */ 187 pt_dump_cont_printf(m, dmsg, " "); 188 } else { 189 if (pr & _PAGE_USER) --- 30 unchanged lines hidden --- 220 if (pr & _PAGE_NX) 221 pt_dump_cont_printf(m, dmsg, "NX "); 222 else 223 pt_dump_cont_printf(m, dmsg, "x "); 224 } 225 pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); 226} 227
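
The signature change at printk_prot() drops the pgprot_t wrapper: the generic walker reports raw pgprotval_t values, which is why the old `pgprotval_t pr = pgprot_val(prot);` unwrap line disappears. For readers unfamiliar with the decoding itself, here is a minimal userspace sketch of the same bit-testing pattern; the X_PAGE_* values are stand-ins for illustration, not the real _PAGE_* constants:

```c
#include <stdio.h>

/* Stand-in flag values; the real masks live in
 * arch/x86/include/asm/pgtable_types.h. */
#define X_PAGE_PRESENT	0x001UL
#define X_PAGE_RW	0x002UL
#define X_PAGE_USER	0x004UL
#define X_PAGE_NX	(1UL << 63)	/* assumes 64-bit long */

static void print_prot(unsigned long pr)
{
	if (!(pr & X_PAGE_PRESENT)) {
		puts("(not present)");	/* kernel prints blank columns here */
		return;
	}
	printf("%s %s %s\n",
	       (pr & X_PAGE_USER) ? "USR" : "   ",
	       (pr & X_PAGE_RW)   ? "RW"  : "ro",
	       (pr & X_PAGE_NX)   ? "NX"  : "x");
}

int main(void)
{
	print_prot(X_PAGE_PRESENT | X_PAGE_RW);			/* writable and executable */
	print_prot(X_PAGE_PRESENT | X_PAGE_USER | X_PAGE_NX);	/* user, no-exec */
	return 0;
}
```
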
227/* 228 * On 64 bits, sign-extend the 48 bit address to 64 bit 229 */ 230static unsigned long normalize_addr(unsigned long u) | 228static void note_wx(struct pg_state *st, unsigned long addr) |
231{ | 229{ |
232 int shift; 233 if (!IS_ENABLED(CONFIG_X86_64)) 234 return u; 235 236 shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); 237 return (signed long)(u << shift) >> shift; 238} 239 240static void note_wx(struct pg_state *st) 241{ | |
242 unsigned long npages; 243 | 230 unsigned long npages; 231 |
244 npages = (st->current_address - st->start_address) / PAGE_SIZE; | 232 npages = (addr - st->start_address) / PAGE_SIZE; |
245 246#ifdef CONFIG_PCI_BIOS 247 /* 248 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. 249 * Inform about it, but avoid the warning. 250 */ 251 if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && | 233 234#ifdef CONFIG_PCI_BIOS 235 /* 236 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. 237 * Inform about it, but avoid the warning. 238 */ 239 if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && |
252 st->current_address <= PAGE_OFFSET + BIOS_END) { | 240 addr <= PAGE_OFFSET + BIOS_END) { |
253 pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); 254 return; 255 } 256#endif 257 /* Account the WX pages */ 258 st->wx_pages += npages; 259 WARN_ONCE(__supported_pte_mask & _PAGE_NX, 260 "x86/mm: Found insecure W+X mapping at address %pS\n", 261 (void *)st->start_address); 262} 263 | 241 pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); 242 return; 243 } 244#endif 245 /* Account the WX pages */ 246 st->wx_pages += npages; 247 WARN_ONCE(__supported_pte_mask & _PAGE_NX, 248 "x86/mm: Found insecure W+X mapping at address %pS\n", 249 (void *)st->start_address); 250} 251 |
| 252static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) 253{ 254 return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | 255 ((prot1 | prot2) & _PAGE_NX); 256} 257
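
effective_prot() itself is unchanged; it only moves up the file (its old copy is visible in the removed block further down) so that it is defined before its new caller, note_page(). It is what makes the W+X check sound across paging levels: _PAGE_USER and _PAGE_RW must be granted at every level (bitwise AND), while _PAGE_NX at any single level suffices to make the mapping non-executable (bitwise OR). A self-contained userspace check of that combination rule, using stand-in bit values rather than the real _PAGE_* constants:

```c
#include <assert.h>

/* Stand-in flag values for illustration only. */
#define X_PAGE_RW	0x002UL
#define X_PAGE_USER	0x004UL
#define X_PAGE_NX	(1UL << 63)	/* assumes 64-bit long */

/* Same combination rule as effective_prot() in the diff. */
static unsigned long effective(unsigned long p1, unsigned long p2)
{
	return (p1 & p2 & (X_PAGE_USER | X_PAGE_RW)) |
	       ((p1 | p2) & X_PAGE_NX);
}

int main(void)
{
	/* NX at the upper level propagates down: not a W+X mapping. */
	unsigned long eff = effective(X_PAGE_RW | X_PAGE_NX, X_PAGE_RW);
	assert((eff & X_PAGE_RW) && (eff & X_PAGE_NX));

	/* RW at both levels, NX at neither: the case note_wx() flags. */
	eff = effective(X_PAGE_RW, X_PAGE_RW);
	assert((eff & X_PAGE_RW) && !(eff & X_PAGE_NX));
	return 0;
}
```
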
|
264/* 265 * This function gets called on a break in a continuous series 266 * of PTE entries; the next one is different so we need to 267 * print what we collected so far. 268 */ | 258/* 259 * This function gets called on a break in a continuous series 260 * of PTE entries; the next one is different so we need to 261 * print what we collected so far. 262 */ |
269static void note_page(struct pg_state *st, pgprot_t new_prot, 270 pgprotval_t new_eff, int level) | 263static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, 264 unsigned long val) |
271{ | 265{ |
272 pgprotval_t prot, cur, eff; | 266 struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); 267 pgprotval_t new_prot, new_eff; 268 pgprotval_t cur, eff; |
273 static const char units[] = "BKMGTPE"; 274 struct seq_file *m = st->seq; 275 | 269 static const char units[] = "BKMGTPE"; 270 struct seq_file *m = st->seq; 271 |
| 272 new_prot = val & PTE_FLAGS_MASK; 273 274 if (level > 1) { 275 new_eff = effective_prot(st->prot_levels[level - 2], 276 new_prot); 277 } else { 278 new_eff = new_prot; 279 } 280 281 if (level > 0) 282 st->prot_levels[level - 1] = new_eff; 283
|
276 /* 277 * If we have a "break" in the series, we need to flush the state that 278 * we have now. "break" is either changing perms, levels or 279 * address space marker. 280 */ | 284 /* 285 * If we have a "break" in the series, we need to flush the state that 286 * we have now. "break" is either changing perms, levels or 287 * address space marker. 288 */ |
281 prot = pgprot_val(new_prot); 282 cur = pgprot_val(st->current_prot); | 289 cur = st->current_prot; |
283 eff = st->effective_prot; 284 285 if (!st->level) { 286 /* First entry */ 287 st->current_prot = new_prot; 288 st->effective_prot = new_eff; 289 st->level = level; 290 st->marker = address_markers; 291 st->lines = 0; 292 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 293 st->marker->name); | 290 eff = st->effective_prot; 291 292 if (!st->level) { 293 /* First entry */ 294 st->current_prot = new_prot; 295 st->effective_prot = new_eff; 296 st->level = level; 297 st->marker = address_markers; 298 st->lines = 0; 299 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 300 st->marker->name); |
294 } else if (prot != cur || new_eff != eff || level != st->level || 295 st->current_address >= st->marker[1].start_address) { | 301 } else if (new_prot != cur || new_eff != eff || level != st->level || 302 addr >= st->marker[1].start_address) { |
296 const char *unit = units; 297 unsigned long delta; 298 int width = sizeof(unsigned long) * 2; 299 300 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) | 303 const char *unit = units; 304 unsigned long delta; 305 int width = sizeof(unsigned long) * 2; 306 307 if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) |
301 note_wx(st); | 308 note_wx(st, addr); |
302 303 /* 304 * Now print the actual finished series 305 */ 306 if (!st->marker->max_lines || 307 st->lines < st->marker->max_lines) { 308 pt_dump_seq_printf(m, st->to_dmesg, 309 "0x%0*lx-0x%0*lx ", 310 width, st->start_address, | 309 310 /* 311 * Now print the actual finished series 312 */ 313 if (!st->marker->max_lines || 314 st->lines < st->marker->max_lines) { 315 pt_dump_seq_printf(m, st->to_dmesg, 316 "0x%0*lx-0x%0*lx ", 317 width, st->start_address, |
311 width, st->current_address); | 318 width, addr); |
312 | 319 |
313 delta = st->current_address - st->start_address; | 320 delta = addr - st->start_address; |
314 while (!(delta & 1023) && unit[1]) { 315 delta >>= 10; 316 unit++; 317 } 318 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", 319 delta, *unit); 320 printk_prot(m, st->current_prot, st->level, 321 st->to_dmesg); 322 } 323 st->lines++; 324 325 /* 326 * We print markers for special areas of address space, 327 * such as the start of vmalloc space etc. 328 * This helps in the interpretation. 329 */ | 321 while (!(delta & 1023) && unit[1]) { 322 delta >>= 10; 323 unit++; 324 } 325 pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", 326 delta, *unit); 327 printk_prot(m, st->current_prot, st->level, 328 st->to_dmesg); 329 } 330 st->lines++; 331 332 /* 333 * We print markers for special areas of address space, 334 * such as the start of vmalloc space etc. 335 * This helps in the interpretation. 336 */ |
330 if (st->current_address >= st->marker[1].start_address) { | 337 if (addr >= st->marker[1].start_address) { |
331 if (st->marker->max_lines && 332 st->lines > st->marker->max_lines) { 333 unsigned long nskip = 334 st->lines - st->marker->max_lines; 335 pt_dump_seq_printf(m, st->to_dmesg, 336 "... %lu entr%s skipped ... \n", 337 nskip, 338 nskip == 1 ? "y" : "ies"); 339 } 340 st->marker++; 341 st->lines = 0; 342 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 343 st->marker->name); 344 } 345 | 338 if (st->marker->max_lines && 339 st->lines > st->marker->max_lines) { 340 unsigned long nskip = 341 st->lines - st->marker->max_lines; 342 pt_dump_seq_printf(m, st->to_dmesg, 343 "... %lu entr%s skipped ... \n", 344 nskip, 345 nskip == 1 ? "y" : "ies"); 346 } 347 st->marker++; 348 st->lines = 0; 349 pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", 350 st->marker->name); 351 } 352 |
346 st->start_address = st->current_address; | 353 st->start_address = addr; |
347 st->current_prot = new_prot; 348 st->effective_prot = new_eff; 349 st->level = level; 350 } 351} 352 | 354 st->current_prot = new_prot; 355 st->effective_prot = new_eff; 356 st->level = level; 357 } 358} 359 |
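
This is the heart of the refactor: note_page() is now the callback handed to the generic walker, so it receives the raw entry as (addr, level, val) and derives both new_prot and the cumulative new_eff itself, caching each level's effective protection in prot_levels[] instead of threading an eff_in argument through recursive walkers. The range-grouping logic is untouched. A toy userspace rendition of that grouping idea, with hypothetical names:

```c
#include <stdio.h>

struct entry { unsigned long addr, prot; };

int main(void)
{
	/* Toy "page table": runs of identical prot merge into one output
	 * line; the final entry only serves to flush the preceding run,
	 * mirroring the walker's closing note_page() call. */
	struct entry tbl[] = {
		{ 0x0000, 1 }, { 0x1000, 1 }, { 0x2000, 2 },
		{ 0x3000, 2 }, { 0x4000, 0 },
	};
	unsigned long start = tbl[0].addr, cur = tbl[0].prot;

	for (size_t i = 1; i < sizeof(tbl) / sizeof(tbl[0]); i++) {
		if (tbl[i].prot != cur) {	/* "break" in the series */
			printf("0x%04lx-0x%04lx prot=%lu\n",
			       start, tbl[i].addr, cur);
			start = tbl[i].addr;
			cur = tbl[i].prot;
		}
	}
	return 0;
}
```

Printed output is one line per run (0x0000-0x2000 prot=1, then 0x2000-0x4000 prot=2), which is exactly the compression the kernel dumper performs per address range.
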
353static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) 354{ 355 return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | 356 ((prot1 | prot2) & _PAGE_NX); 357} 358 359static void walk_pte_level(struct pg_state *st, pmd_t addr, pgprotval_t eff_in, 360 unsigned long P) 361{ 362 int i; 363 pte_t *pte; 364 pgprotval_t prot, eff; 365 366 for (i = 0; i < PTRS_PER_PTE; i++) { 367 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 368 pte = pte_offset_map(&addr, st->current_address); 369 prot = pte_flags(*pte); 370 eff = effective_prot(eff_in, prot); 371 note_page(st, __pgprot(prot), eff, 5); 372 pte_unmap(pte); 373 } 374} 375#ifdef CONFIG_KASAN 376 377/* 378 * This is an optimization for KASAN=y case. Since all kasan page tables 379 * eventually point to the kasan_early_shadow_page we could call note_page() 380 * right away without walking through lower level page tables. This saves 381 * us dozens of seconds (minutes for 5-level config) while checking for 382 * W+X mapping or reading kernel_page_tables debugfs file. 383 */ 384static inline bool kasan_page_table(struct pg_state *st, void *pt) 385{ 386 if (__pa(pt) == __pa(kasan_early_shadow_pmd) || 387 (pgtable_l5_enabled() && 388 __pa(pt) == __pa(kasan_early_shadow_p4d)) || 389 __pa(pt) == __pa(kasan_early_shadow_pud)) { 390 pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]); 391 note_page(st, __pgprot(prot), 0, 5); 392 return true; 393 } 394 return false; 395} 396#else 397static inline bool kasan_page_table(struct pg_state *st, void *pt) 398{ 399 return false; 400} 401#endif 402 403#if PTRS_PER_PMD > 1 404 405static void walk_pmd_level(struct pg_state *st, pud_t addr, 406 pgprotval_t eff_in, unsigned long P) 407{ 408 int i; 409 pmd_t *start, *pmd_start; 410 pgprotval_t prot, eff; 411 412 pmd_start = start = (pmd_t *)pud_page_vaddr(addr); 413 for (i = 0; i < PTRS_PER_PMD; i++) { 414 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 415 if (!pmd_none(*start)) { 416 prot = pmd_flags(*start); 417 eff = effective_prot(eff_in, prot); 418 if (pmd_large(*start) || !pmd_present(*start)) { 419 note_page(st, __pgprot(prot), eff, 4); 420 } else if (!kasan_page_table(st, pmd_start)) { 421 walk_pte_level(st, *start, eff, 422 P + i * PMD_LEVEL_MULT); 423 } 424 } else 425 note_page(st, __pgprot(0), 0, 4); 426 start++; 427 } 428} 429 430#else 431#define walk_pmd_level(s,a,e,p) walk_pte_level(s,__pmd(pud_val(a)),e,p) 432#define pud_large(a) pmd_large(__pmd(pud_val(a))) 433#define pud_none(a) pmd_none(__pmd(pud_val(a))) 434#endif 435 436#if PTRS_PER_PUD > 1 437 438static void walk_pud_level(struct pg_state *st, p4d_t addr, pgprotval_t eff_in, 439 unsigned long P) 440{ 441 int i; 442 pud_t *start, *pud_start; 443 pgprotval_t prot, eff; 444 445 pud_start = start = (pud_t *)p4d_page_vaddr(addr); 446 447 for (i = 0; i < PTRS_PER_PUD; i++) { 448 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 449 if (!pud_none(*start)) { 450 prot = pud_flags(*start); 451 eff = effective_prot(eff_in, prot); 452 if (pud_large(*start) || !pud_present(*start)) { 453 note_page(st, __pgprot(prot), eff, 3); 454 } else if (!kasan_page_table(st, pud_start)) { 455 walk_pmd_level(st, *start, eff, 456 P + i * PUD_LEVEL_MULT); 457 } 458 } else 459 note_page(st, __pgprot(0), 0, 3); 460 461 start++; 462 } 463} 464 465#else 466#define walk_pud_level(s,a,e,p) walk_pmd_level(s,__pud(p4d_val(a)),e,p) 467#define p4d_large(a) pud_large(__pud(p4d_val(a))) 468#define p4d_none(a) pud_none(__pud(p4d_val(a))) 469#endif 470 471static void 
walk_p4d_level(struct pg_state *st, pgd_t addr, pgprotval_t eff_in, 472 unsigned long P) 473{ 474 int i; 475 p4d_t *start, *p4d_start; 476 pgprotval_t prot, eff; 477 478 if (PTRS_PER_P4D == 1) 479 return walk_pud_level(st, __p4d(pgd_val(addr)), eff_in, P); 480 481 p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); 482 483 for (i = 0; i < PTRS_PER_P4D; i++) { 484 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); 485 if (!p4d_none(*start)) { 486 prot = p4d_flags(*start); 487 eff = effective_prot(eff_in, prot); 488 if (p4d_large(*start) || !p4d_present(*start)) { 489 note_page(st, __pgprot(prot), eff, 2); 490 } else if (!kasan_page_table(st, p4d_start)) { 491 walk_pud_level(st, *start, eff, 492 P + i * P4D_LEVEL_MULT); 493 } 494 } else 495 note_page(st, __pgprot(0), 0, 2); 496 497 start++; 498 } 499} 500 501#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) 502#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) 503 504static inline bool is_hypervisor_range(int idx) 505{ 506#ifdef CONFIG_X86_64 507 /* 508 * A hole in the beginning of kernel address space reserved 509 * for a hypervisor. 510 */ 511 return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) && 512 (idx < pgd_index(GUARD_HOLE_END_ADDR)); 513#else 514 return false; 515#endif 516} 517 | |
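
Everything in the removed block above exists only in the old column: the hand-rolled walk_pte_level()/walk_pmd_level()/walk_pud_level()/walk_p4d_level() helpers, the KASAN early-shadow shortcut, the pgd_large()/pgd_none() fixups, and is_hypervisor_range() all go away because <linux/ptdump.h> now owns the traversal. Reconstructed from how this diff uses it (.note_page, .range, the {0, 0} terminator, and level 1 meaning "pgd" per level_name[] above), the generic interface looks roughly like the following. This is a compilable sketch with stand-in types and a stub walker, not the mm/ptdump.c source:

```c
#include <stdio.h>

struct ptdump_range { unsigned long start, end; };

struct ptdump_state {
	/* Callback fired for each entry (or hole) the walk encounters. */
	void (*note_page)(struct ptdump_state *st, unsigned long addr,
			  int level, unsigned long val);
	const struct ptdump_range *range;	/* {0, 0}-terminated */
};

/* Stub walker: visits one fake pgd-level entry per range. */
static void toy_walk(struct ptdump_state *st)
{
	for (const struct ptdump_range *r = st->range; r->start != r->end; r++)
		st->note_page(st, r->start, 1, 0x1UL);
	st->note_page(st, 0, 0, 0);	/* final call flushes the last range */
}

static void my_note_page(struct ptdump_state *st, unsigned long addr,
			 int level, unsigned long val)
{
	printf("addr=%#lx level=%d val=%#lx\n", addr, level, val);
}

int main(void)
{
	struct ptdump_range ranges[] = { { 0x0, 0x1000 }, { 0, 0 } };
	struct ptdump_state st = { .note_page = my_note_page, .range = ranges };

	toy_walk(&st);
	return 0;
}
```

The terminating note_page(st, 0, 0, 0) call is presumably how the generic walker replaces the explicit "Flush out the last page" call that disappears from ptdump_walk_pgd_level_core() below; x86's note_page() treats level 0 as a forced break, so the last accumulated range gets printed.
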
518static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 519 bool checkwx, bool dmesg) 520{ | 360static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 361 bool checkwx, bool dmesg) 362{ |
521 pgd_t *start = pgd; 522 pgprotval_t prot, eff; 523 int i; 524 struct pg_state st = {}; | 363 const struct ptdump_range ptdump_ranges[] = { 364#ifdef CONFIG_X86_64 |
525 | 365 |
526 st.to_dmesg = dmesg; 527 st.check_wx = checkwx; 528 st.seq = m; 529 if (checkwx) 530 st.wx_pages = 0; | 366#define normalize_addr_shift (64 - (__VIRTUAL_MASK_SHIFT + 1)) 367#define normalize_addr(u) ((signed long)((u) << normalize_addr_shift) >> \ 368 normalize_addr_shift) |
531 | 369 |
532 for (i = 0; i < PTRS_PER_PGD; i++) { 533 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 534 if (!pgd_none(*start) && !is_hypervisor_range(i)) { 535 prot = pgd_flags(*start); 536#ifdef CONFIG_X86_PAE 537 eff = _PAGE_USER | _PAGE_RW; | 370 {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, 371 {normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT / 2), ~0UL}, |
538#else | 372#else |
539 eff = prot; | 373 {0, ~0UL}, |
540#endif | 374#endif |
541 if (pgd_large(*start) || !pgd_present(*start)) { 542 note_page(&st, __pgprot(prot), eff, 1); 543 } else { 544 walk_p4d_level(&st, *start, eff, 545 i * PGD_LEVEL_MULT); 546 } 547 } else 548 note_page(&st, __pgprot(0), 0, 1); | 375 {0, 0} 376}; |
549 | 377 |
550 cond_resched(); 551 start++; 552 } | 378 struct pg_state st = { 379 .ptdump = { 380 .note_page = note_page, 381 .range = ptdump_ranges 382 }, 383 .to_dmesg = dmesg, 384 .check_wx = checkwx, 385 .seq = m 386 }; |
553 | 387 |
554 /* Flush out the last page */ 555 st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); 556 note_page(&st, __pgprot(0), 0, 0); | 388 struct mm_struct fake_mm = { 389 .pgd = pgd 390 }; 391 init_rwsem(&fake_mm.mmap_sem); 392 393 ptdump_walk_pgd(&st.ptdump, &fake_mm); 394 |
557 if (!checkwx) 558 return; 559 if (st.wx_pages) 560 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", 561 st.wx_pages); 562 else 563 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); 564} --- 71 unchanged lines hidden --- | 395 if (!checkwx) 396 return; 397 if (st.wx_pages) 398 pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", 399 st.wx_pages); 400 else 401 pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); 402} --- 71 unchanged lines hidden --- |
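
The rewritten core no longer iterates PGD slots by hand: it describes the two canonical halves of the 64-bit address space as a {0, 0}-terminated ptdump_ranges[] table, sign-extending the start of the upper half with the local normalize_addr() macro, and hands the generic walker a throwaway fake_mm whose only job is to carry the pgd (plus an initialized mmap_sem, presumably because ptdump_walk_pgd() takes it for reading). The sign-extension trick deserves a worked example; this assumes the 4-level case, __VIRTUAL_MASK_SHIFT == 47, so the shift is 64 - 48 = 16:

```c
#include <assert.h>

/* Sign-extend a 48-bit virtual address to 64 bits, as normalize_addr()
 * does. Relies on arithmetic right shift of signed long, which gcc and
 * clang guarantee even though the C standard leaves it
 * implementation-defined. */
#define NORM_SHIFT 16	/* 64 - (__VIRTUAL_MASK_SHIFT + 1), assuming 47 */
#define normalize(u) \
	((unsigned long)((signed long)((u) << NORM_SHIFT) >> NORM_SHIFT))

int main(void)
{
	/* Lower canonical half is unchanged... */
	assert(normalize(0x00007fffffffffffUL) == 0x00007fffffffffffUL);
	/* ...while the upper half gains the canonical 0xffff prefix,
	 * which is exactly the start of the second ptdump_range. */
	assert(normalize(0x0000800000000000UL) == 0xffff800000000000UL);
	return 0;
}
```

With 5-level paging, __VIRTUAL_MASK_SHIFT grows and the shift shrinks accordingly, so the same macro yields 57-bit canonical addresses; the range table itself is computed from PTRS_PER_PGD * PGD_LEVEL_MULT either way.
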