/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Ian Lepore <ian@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include "opt_acpi.h"
#include "opt_ddb.h"
#endif

/*
 * Routines for describing and initializing anything related to physical
 * memory.
 */
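
/*
 * Typical use, sketched for illustration only (the exact calls and flags are
 * issued by each platform's early startup code and vary by machine):
 *
 *	physmem_hardware_region(pa, size);	// each RAM range discovered
 *	physmem_exclude_region(pa, size,	// each reserved range
 *	    EXFLAG_NOALLOC | EXFLAG_NODUMP);
 *	physmem_init_kernel_globals();		// build phys_avail/dump_avail
 *	if (bootverbose)
 *		physmem_print_tables();
 */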

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/physmem.h>

#ifdef _KERNEL
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#include <machine/md_var.h>
#include <machine/resource.h>
#else
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#endif

/*
 * These structures are used internally to keep track of regions of physical
 * ram, and regions within the physical ram that need to be excluded.  An
 * exclusion region can be excluded from crash dumps, from the vm pool of pages
 * that can be allocated, or both, depending on the exclusion flags associated
 * with the region.
 */
#ifdef DEV_ACPI
#define	MAX_HWCNT	32	/* ACPI needs more regions */
#define	MAX_EXCNT	32
#else
#define	MAX_HWCNT	16
#define	MAX_EXCNT	16
#endif

#if defined(__arm__)
#define	MAX_PHYS_ADDR	0xFFFFFFFFull
#elif defined(__aarch64__) || defined(__amd64__) || defined(__riscv)
#define	MAX_PHYS_ADDR	0xFFFFFFFFFFFFFFFFull
#endif

struct region {
	vm_paddr_t	addr;
	vm_size_t	size;
	uint32_t	flags;
};

static struct region hwregions[MAX_HWCNT];
static struct region exregions[MAX_EXCNT];

static size_t hwcnt;
static size_t excnt;

/*
 * realmem is the total number of hardware pages, excluded or not.
 * Maxmem is one greater than the last physical page number.
 */
long realmem;
long Maxmem;

#ifndef _KERNEL
static void
panic(const char *fmt, ...)
{
	va_list va;

	va_start(va, fmt);
	vfprintf(stderr, fmt, va);
	fprintf(stderr, "\n");
	va_end(va);
	__builtin_trap();
}
#endif

/*
 * Print the contents of the physical and excluded region tables using the
 * provided printf-like output function (which will be either printf or
 * db_printf).
 */
static void
physmem_dump_tables(int (*prfunc)(const char *, ...) __printflike(1, 2))
{
	size_t i;
	int flags;
	uintmax_t addr, size;
	const unsigned int mbyte = 1024 * 1024;

	prfunc("Physical memory chunk(s):\n");
	for (i = 0; i < hwcnt; ++i) {
		addr = hwregions[i].addr;
		size = hwregions[i].size;
		prfunc("  0x%08jx - 0x%08jx, %5ju MB (%7ju pages)\n", addr,
		    addr + size - 1, size / mbyte, size / PAGE_SIZE);
	}

	prfunc("Excluded memory regions:\n");
	for (i = 0; i < excnt; ++i) {
		addr  = exregions[i].addr;
		size  = exregions[i].size;
		flags = exregions[i].flags;
		prfunc("  0x%08jx - 0x%08jx, %5ju MB (%7ju pages) %s %s\n",
		    addr, addr + size - 1, size / mbyte, size / PAGE_SIZE,
		    (flags & EXFLAG_NOALLOC) ? "NoAlloc" : "",
		    (flags & EXFLAG_NODUMP) ? "NoDump" : "");
	}

#ifdef DEBUG
	prfunc("Avail lists:\n");
	for (i = 0; phys_avail[i] != 0; ++i) {
		prfunc("  phys_avail[%zu] 0x%08jx\n", i,
		    (uintmax_t)phys_avail[i]);
	}
	for (i = 0; dump_avail[i] != 0; ++i) {
		prfunc("  dump_avail[%zu] 0x%08jx\n", i,
		    (uintmax_t)dump_avail[i]);
	}
#endif
}

/*
 * Print the contents of the physical and excluded region tables.  Used for
 * bootverbose output.
 */
void
physmem_print_tables(void)
{

	physmem_dump_tables(printf);
}

/*
 * Walk the list of hardware regions, processing it against the list of
 * exclusions that contain the given exflags, and generating an "avail list".
 *
 * If maxphyssz is not zero it sets an upper limit, in bytes, on the total
 * "avail list" size.  The walk stops once the limit is reached and the last
 * region is cut short if necessary.
 *
 * Updates the value at *pavail with the number of non-excluded pages and the
 * value at *prealmem with the total number of pages in all hw regions.
 *
 * Returns the number of entries placed in the avail list.
 */
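/*
 * A worked example, illustrative only: given one hardware region of 512MB at
 * 0x40000000 and one EXFLAG_NOALLOC exclusion of 1MB at 0x48000000 (all
 * page-aligned), a walk with exflags == EXFLAG_NOALLOC produces the pairs
 *
 *	avail[] = { 0x40000000, 0x48000000, 0x48100000, 0x60000000 }
 *
 * i.e. start/end pairs of usable ranges with the excluded megabyte carved out
 * of the middle.  The function does not add terminators itself; the caller's
 * array is expected to be zeroed beforehand so the list ends up
 * zero-terminated, as the phys_avail/dump_avail consumers require.
 */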
static size_t
regions_to_avail(vm_paddr_t *avail, uint32_t exflags, size_t maxavail,
    uint64_t maxphyssz, long *pavail, long *prealmem)
{
	size_t acnt, exi, hwi;
	uint64_t adj, end, start, xend, xstart;
	long availmem, totalmem;
	const struct region *exp, *hwp;
	uint64_t availsz;

	totalmem = 0;
	availmem = 0;
	availsz = 0;
	acnt = 0;
	for (hwi = 0, hwp = hwregions; hwi < hwcnt; ++hwi, ++hwp) {
		adj = round_page(hwp->addr) - hwp->addr;
		start = round_page(hwp->addr);
		end = trunc_page(hwp->size + adj) + start;
		totalmem += atop((vm_offset_t)(end - start));
		for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) {
			/*
			 * If the excluded region does not match given flags,
			 * continue checking with the next excluded region.
			 */
			if ((exp->flags & exflags) == 0)
				continue;
			xstart = exp->addr;
			xend = exp->size + xstart;
			/*
			 * If the excluded region ends before this hw region,
			 * continue checking with the next excluded region.
			 */
			if (xend <= start)
				continue;
			/*
			 * If the excluded region begins after this hw region
			 * we're done because both lists are sorted.
			 */
			if (xstart >= end)
				break;
			/*
			 * If the excluded region completely covers this hw
			 * region, shrink this hw region to zero size.
			 */
			if ((start >= xstart) && (end <= xend)) {
				start = xend;
				end = xend;
				break;
			}
			/*
			 * If the excluded region falls wholly within this hw
			 * region without abutting or overlapping the beginning
			 * or end, create an available entry from the leading
			 * fragment, then adjust the start of this hw region to
			 * the end of the excluded region, and continue checking
			 * the next excluded region because another exclusion
			 * could affect the remainder of this hw region.
			 */
			if ((xstart > start) && (xend < end)) {

				if ((maxphyssz != 0) &&
				    (availsz + xstart - start > maxphyssz)) {
					xstart = maxphyssz + start - availsz;
				}
				if (xstart <= start)
					continue;
				if (acnt > 0 &&
				    avail[acnt - 1] == (vm_paddr_t)start) {
					avail[acnt - 1] = (vm_paddr_t)xstart;
				} else {
					avail[acnt++] = (vm_paddr_t)start;
					avail[acnt++] = (vm_paddr_t)xstart;
				}
				availsz += (xstart - start);
				availmem += atop((vm_offset_t)(xstart - start));
				start = xend;
				continue;
			}
			/*
			 * We know the excluded region overlaps either the start
			 * or end of this hardware region (but not both), trim
			 * the excluded portion off the appropriate end.
			 */
			if (xstart <= start)
				start = xend;
			else
				end = xstart;
		}
		/*
		 * If the trimming actions above left a non-zero size, create an
		 * available entry for it.
		 */
		if (end > start) {
			if ((maxphyssz != 0) &&
			    (availsz + end - start > maxphyssz)) {
				end = maxphyssz + start - availsz;
			}
			if (end <= start)
				break;

			if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) {
				avail[acnt - 1] = (vm_paddr_t)end;
			} else {
				avail[acnt++] = (vm_paddr_t)start;
				avail[acnt++] = (vm_paddr_t)end;
			}
			availsz += end - start;
			availmem += atop((vm_offset_t)(end - start));
		}
		if (acnt >= maxavail)
			panic("Not enough space in the dump/phys_avail arrays");
	}

	if (pavail != NULL)
		*pavail = availmem;
	if (prealmem != NULL)
		*prealmem = totalmem;
	return (acnt);
}
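
/*
 * The hw/exclusion tables themselves are kept sorted by address and coalesced
 * as entries are added, by insert_region() and merge_upper_regions() below.
 * A sketch of the behaviour, illustrative only: with existing same-flag
 * entries covering 0x40000000-0x4fffffff and 0x60000000-0x6fffffff, inserting
 * a region at 0x48000000 with size 0x20000000 grows the first entry, merges
 * the second into it, and leaves a single entry covering
 * 0x40000000-0x6fffffff.
 */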

/*
 * Check if the region at idx can be merged with the region above it.
 */
static size_t
merge_upper_regions(struct region *regions, size_t rcnt, size_t idx)
{
	struct region *lower, *upper;
	vm_paddr_t lend, uend;
	size_t i, mergecnt, movecnt;

	lower = &regions[idx];
	lend = lower->addr + lower->size;

	/*
	 * Continue merging in upper entries as long as we have entries to
	 * merge; the new block could have spanned more than one, although one
	 * is likely the common case.
	 */
	for (i = idx + 1; i < rcnt; i++) {
		upper = &regions[i];
		if (lend < upper->addr || lower->flags != upper->flags)
			break;

		uend = upper->addr + upper->size;
		if (uend > lend) {
			lower->size += uend - lend;
			lend = lower->addr + lower->size;
		}

		if (uend >= lend) {
			/*
			 * If we didn't move past the end of the upper region,
			 * then we don't need to bother checking for another
			 * merge because it would have been done already.  Just
			 * increment i once more to maintain the invariant that
			 * i is one past the last entry merged.
			 */
			i++;
			break;
		}
	}

	/*
	 * We merged in the entries from [idx + 1, i); physically move the tail
	 * end at [i, rcnt) if we need to.
	 */
	mergecnt = i - (idx + 1);
	if (mergecnt > 0) {
		movecnt = rcnt - i;
		if (movecnt == 0) {
			/* Merged all the way to the end, just decrease rcnt. */
			rcnt = idx + 1;
		} else {
			memmove(&regions[idx + 1], &regions[idx + mergecnt + 1],
			    movecnt * sizeof(*regions));
			rcnt -= mergecnt;
		}
	}
	return (rcnt);
}

/*
 * Insertion-sort a new entry into a regions list; sorted by start address.
 */
static size_t
insert_region(struct region *regions, size_t rcnt, vm_paddr_t addr,
    vm_size_t size, uint32_t flags)
{
	size_t i;
	vm_paddr_t nend, rend;
	struct region *ep, *rp;

	nend = addr + size;
	ep = regions + rcnt;
	for (i = 0, rp = regions; i < rcnt; ++i, ++rp) {
		rend = rp->addr + rp->size;
		if (flags == rp->flags) {
			if (addr <= rp->addr && nend >= rp->addr) {
				/*
				 * New region overlaps at the beginning; grow
				 * this entry downward to the new start
				 * address, then grow it upward if the new
				 * region extends past the existing end.
				 */
				rp->size += rp->addr - addr;
				rp->addr = addr;
				if (nend > rend) {
					rp->size += nend - rend;
					rcnt = merge_upper_regions(regions,
					    rcnt, i);
				}
				return (rcnt);
			} else if (addr <= rend && nend > rp->addr) {
				/*
				 * New region is either entirely contained
				 * within this entry or overlaps at its end.
				 */
				if (nend > rend) {
					rp->size += nend - rend;
					rcnt = merge_upper_regions(regions,
					    rcnt, i);
				}
				return (rcnt);
			}
		} else if ((flags != 0) && (rp->flags != 0)) {
			/*
			 * If we're duplicating an entry that already exists
			 * exactly, just upgrade its flags as needed.  We could
			 * do more if we find that we have differently specified
			 * flags clipping existing exclusion regions, but that's
			 * probably rare.
			 */
			if (addr == rp->addr && nend == rend) {
				rp->flags |= flags;
				return (rcnt);
			}
		}

		if (addr < rp->addr) {
			bcopy(rp, rp + 1, (ep - rp) * sizeof(*rp));
			break;
		}
	}
	rp->addr = addr;
	rp->size = size;
	rp->flags = flags;
	rcnt++;

	return (rcnt);
}

/*
 * Add a hardware memory region.
 */
void
physmem_hardware_region(uint64_t pa, uint64_t sz)
{
	/*
	 * Filter out the page at PA 0x00000000.  The VM can't handle it, as
	 * pmap_extract() == 0 means failure.
	 */
	if (pa == 0) {
		if (sz <= PAGE_SIZE)
			return;
		pa = PAGE_SIZE;
		sz -= PAGE_SIZE;
	} else if (pa > MAX_PHYS_ADDR) {
		/* This range is past usable memory, ignore it */
		return;
	}

	/*
	 * Also filter out the page at the end of the physical address space --
	 * if addr is non-zero and addr+size is zero we wrapped to the next byte
	 * beyond what vm_paddr_t can express.  That leads to a NULL pointer
	 * deref early in startup; work around it by leaving the last page out.
	 *
	 * XXX This just in: subtract out a whole megabyte, not just 1 page.
	 * Reducing the size by anything less than 1MB results in the NULL
	 * pointer deref in _vm_map_lock_read().  Better to give up a megabyte
	 * than leave some folks with an unusable system while we investigate.
	 */
	if ((pa + sz) > (MAX_PHYS_ADDR - 1024 * 1024)) {
		sz = MAX_PHYS_ADDR - pa + 1;
		if (sz <= 1024 * 1024)
			return;
		sz -= 1024 * 1024;
	}

	if (sz > 0 && hwcnt < nitems(hwregions))
		hwcnt = insert_region(hwregions, hwcnt, pa, sz, 0);
}
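
/*
 * An illustrative example of the filtering above, assuming 4KB pages: a
 * firmware-reported range of 0x00000000-0x1fffffff is registered as
 * 0x00001000-0x1fffffff because page zero is dropped, and a range that runs
 * all the way up to MAX_PHYS_ADDR is shortened by 1MB before being inserted.
 */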

/*
 * Add an exclusion region.
 */
void
physmem_exclude_region(vm_paddr_t pa, vm_size_t sz, uint32_t exflags)
{
	vm_offset_t adj;

	/*
	 * Truncate the starting address down to a page boundary, and round the
	 * ending page up to a page boundary.
	 */
	adj = pa - trunc_page(pa);
	pa = trunc_page(pa);
	sz = round_page(sz + adj);

	if (excnt >= nitems(exregions))
		panic("failed to exclude region %#jx-%#jx", (uintmax_t)pa,
		    (uintmax_t)(pa + sz));
	excnt = insert_region(exregions, excnt, pa, sz, exflags);
}

/*
 * Generate an avail list from the hardware regions, omitting anything
 * excluded with EXFLAG_NOALLOC.
 */
size_t
physmem_avail(vm_paddr_t *avail, size_t maxavail)
{

	return (regions_to_avail(avail, EXFLAG_NOALLOC, maxavail, 0, NULL,
	    NULL));
}

/*
 * Return true if the given range lies entirely within one of the exclusion
 * regions, regardless of its exclusion flags.
 */
bool
physmem_excluded(vm_paddr_t pa, vm_size_t sz)
{
	const struct region *exp;
	size_t exi;

	for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) {
		if (pa < exp->addr || pa + sz > exp->addr + exp->size)
			continue;
		return (true);
	}
	return (false);
}

#ifdef _KERNEL
/*
 * Process all the regions added earlier into the global avail lists.
 *
 * Updates the kernel global 'physmem' with the number of physical pages
 * available for use (all pages not in any exclusion region).
 *
 * Updates the kernel global 'Maxmem' with the page number one greater than the
 * last page of physical memory in the system.
 */
void
physmem_init_kernel_globals(void)
{
	size_t nextidx;
	u_long hwphyssz;

	/* Honor any hw.physmem limit set via the loader. */
	hwphyssz = 0;
	TUNABLE_ULONG_FETCH("hw.physmem", &hwphyssz);

	regions_to_avail(dump_avail, EXFLAG_NODUMP, PHYS_AVAIL_ENTRIES,
	    hwphyssz, NULL, NULL);
	nextidx = regions_to_avail(phys_avail, EXFLAG_NOALLOC,
	    PHYS_AVAIL_ENTRIES, hwphyssz, &physmem, &realmem);
	if (nextidx == 0)
		panic("No memory entries in phys_avail");
	Maxmem = atop(phys_avail[nextidx - 1]);
}

#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND_FLAGS(physmem, db_show_physmem, DB_CMD_MEMSAFE)
{

	physmem_dump_tables(db_printf);
}

#endif /* DDB */

/*
 * ram pseudo driver - this reserves I/O space resources corresponding to
 * physical memory regions.
 */

static void
ram_identify(driver_t *driver, device_t parent)
{

	if (resource_disabled("ram", 0))
		return;
	if (BUS_ADD_CHILD(parent, 0, "ram", 0) == NULL)
		panic("ram_identify");
}

static int
ram_probe(device_t dev)
{

	device_quiet(dev);
	device_set_desc(dev, "System RAM");
	return (BUS_PROBE_SPECIFIC);
}

static int
ram_attach(device_t dev)
{
	vm_paddr_t avail_list[PHYS_AVAIL_COUNT];
	rman_res_t start, end;
	struct region *hwp;
	int rid, i;

	rid = 0;

	/* Get the avail list. */
	bzero(avail_list, sizeof(avail_list));
	regions_to_avail(avail_list, EXFLAG_NOALLOC | EXFLAG_NODUMP,
	    PHYS_AVAIL_COUNT, 0, NULL, NULL);

	/* Reserve all memory regions. */
	for (i = 0; avail_list[i + 1] != 0; i += 2) {
		start = avail_list[i];
		end = avail_list[i + 1];

		if (bootverbose)
			device_printf(dev,
			    "reserving memory region: %jx-%jx\n",
			    (uintmax_t)start, (uintmax_t)end);

		if (bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, start, end,
		    end - start, 0) == NULL)
			panic("ram_attach: resource %d failed to attach", rid);
		rid++;
	}

	/* Now, reserve the excluded memory regions. */
	for (i = 0, hwp = exregions; i < excnt; i++, hwp++) {
		start = hwp->addr;
		end = hwp->addr + hwp->size;

		if (bootverbose)
			device_printf(dev,
			    "reserving excluded region: %jx-%jx\n",
			    (uintmax_t)start, (uintmax_t)(end - 1));

		/*
		 * Best-effort attempt to reserve the range.  This may fail, as
		 * sometimes the excluded ranges provided by the device tree
		 * will cover or overlap some I/O range.
		 */
		if (bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, start, end,
		    end - start, 0) == NULL) {
			if (bootverbose)
				device_printf(dev, "failed to reserve region\n");
			continue;
		}
		rid++;
	}

	return (0);
}

static device_method_t ram_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify,	ram_identify),
	DEVMETHOD(device_probe,		ram_probe),
	DEVMETHOD(device_attach,	ram_attach),

	DEVMETHOD_END
};

DEFINE_CLASS_0(ram, ram_driver, ram_methods, /* no softc */ 1);
DRIVER_MODULE(ram, nexus, ram_driver, 0, 0);
#endif /* _KERNEL */