/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Ian Lepore <ian@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include "opt_acpi.h"
#include "opt_ddb.h"
#endif

/*
 * Routines for describing and initializing anything related to physical
 * memory.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/physmem.h>

#ifdef _KERNEL
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#include <machine/md_var.h>
#else
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#endif

/*
 * These structures are used internally to keep track of regions of physical
 * ram, and the regions within the physical ram that need to be excluded.  An
 * exclusion region can be excluded from crash dumps, from the vm pool of pages
 * that can be allocated, or both, depending on the exclusion flags associated
 * with the region.
 */
#ifdef DEV_ACPI
#define	MAX_HWCNT	32	/* ACPI needs more regions */
#define	MAX_EXCNT	32
#else
#define	MAX_HWCNT	16
#define	MAX_EXCNT	16
#endif

#if defined(__arm__)
#define	MAX_PHYS_ADDR	0xFFFFFFFFull
#elif defined(__aarch64__) || defined(__amd64__) || defined(__riscv)
#define	MAX_PHYS_ADDR	0xFFFFFFFFFFFFFFFFull
#endif

struct region {
	vm_paddr_t	addr;
	vm_size_t	size;
	uint32_t	flags;
};

static struct region hwregions[MAX_HWCNT];
static struct region exregions[MAX_EXCNT];

static size_t hwcnt;
static size_t excnt;

/*
 * realmem is the total number of hardware pages, excluded or not.
 * Maxmem is one greater than the last physical page number.
 */
long realmem;
long Maxmem;
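
/*
 * Illustrative sketch (hypothetical values, not part of the interface): a
 * platform that registers one 1 GB bank of ram and excludes a 16 MB firmware
 * carve-out would end up with tables like:
 *
 *	hwregions[0] = { .addr = 0x80000000, .size = 0x40000000, .flags = 0 }
 *	exregions[0] = { .addr = 0x88000000, .size = 0x01000000,
 *	    .flags = EXFLAG_NOALLOC | EXFLAG_NODUMP }
 *
 * regions_to_avail() below would then emit the avail-list pairs
 * { 0x80000000, 0x88000000 } and { 0x89000000, 0xC0000000 }.
 */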

#ifndef _KERNEL
static void
panic(const char *fmt, ...)
{
	va_list va;

	va_start(va, fmt);
	vfprintf(stderr, fmt, va);
	fprintf(stderr, "\n");
	va_end(va);
	__builtin_trap();
}
#endif

/*
 * Print the contents of the physical and excluded region tables using the
 * provided printf-like output function (which will be either printf or
 * db_printf).
 */
static void
physmem_dump_tables(int (*prfunc)(const char *, ...))
{
	size_t i;
	int flags;
	uintmax_t addr, size;
	const unsigned int mbyte = 1024 * 1024;

	prfunc("Physical memory chunk(s):\n");
	for (i = 0; i < hwcnt; ++i) {
		addr = hwregions[i].addr;
		size = hwregions[i].size;
		prfunc("  0x%08jx - 0x%08jx, %5ju MB (%7ju pages)\n", addr,
		    addr + size - 1, size / mbyte, size / PAGE_SIZE);
	}

	prfunc("Excluded memory regions:\n");
	for (i = 0; i < excnt; ++i) {
		addr = exregions[i].addr;
		size = exregions[i].size;
		flags = exregions[i].flags;
		prfunc("  0x%08jx - 0x%08jx, %5ju MB (%7ju pages) %s %s\n",
		    addr, addr + size - 1, size / mbyte, size / PAGE_SIZE,
		    (flags & EXFLAG_NOALLOC) ? "NoAlloc" : "",
		    (flags & EXFLAG_NODUMP) ? "NoDump" : "");
	}

#ifdef DEBUG
	prfunc("Avail lists:\n");
	for (i = 0; phys_avail[i] != 0; ++i) {
		prfunc("  phys_avail[%zu] 0x%08jx\n", i,
		    (uintmax_t)phys_avail[i]);
	}
	for (i = 0; dump_avail[i] != 0; ++i) {
		prfunc("  dump_avail[%zu] 0x%08jx\n", i,
		    (uintmax_t)dump_avail[i]);
	}
#endif
}

/*
 * Print the contents of the physical and excluded region tables.  Used for
 * bootverbose output.
 */
void
physmem_print_tables(void)
{

	physmem_dump_tables(printf);
}

/*
 * Walk the list of hardware regions, processing it against the list of
 * exclusions that contain the given exflags, and generating an "avail list".
 *
 * If maxphyssz is not zero, it sets an upper limit, in bytes, on the total
 * size of the "avail list".  The walk stops once the limit is reached, and
 * the last region is cut short if necessary.
 *
 * Updates the value at *pavail with the number of non-excluded pages added to
 * the avail list, and the value at *prealmem with the total page count of all
 * hw regions.
 *
 * Returns the number of entries in the avail list.
 */
static size_t
regions_to_avail(vm_paddr_t *avail, uint32_t exflags, size_t maxavail,
    uint64_t maxphyssz, long *pavail, long *prealmem)
{
	size_t acnt, exi, hwi;
	uint64_t adj, end, start, xend, xstart;
	long availmem, totalmem;
	const struct region *exp, *hwp;
	uint64_t availsz;

	totalmem = 0;
	availmem = 0;
	availsz = 0;
	acnt = 0;
	for (hwi = 0, hwp = hwregions; hwi < hwcnt; ++hwi, ++hwp) {
		adj = round_page(hwp->addr) - hwp->addr;
		start = round_page(hwp->addr);
		end = trunc_page(hwp->size + adj) + start;
		totalmem += atop((vm_offset_t)(end - start));
		for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) {
			/*
			 * If the excluded region does not match given flags,
			 * continue checking with the next excluded region.
			 */
			if ((exp->flags & exflags) == 0)
				continue;
			xstart = exp->addr;
			xend = exp->size + xstart;
			/*
			 * If the excluded region ends before this hw region,
			 * continue checking with the next excluded region.
			 */
			if (xend <= start)
				continue;
			/*
			 * If the excluded region begins after this hw region
			 * we're done because both lists are sorted.
			 */
			if (xstart >= end)
				break;
			/*
			 * If the excluded region completely covers this hw
			 * region, shrink this hw region to zero size.
			 */
			if ((start >= xstart) && (end <= xend)) {
				start = xend;
				end = xend;
				break;
			}
			/*
			 * If the excluded region falls wholly within this hw
			 * region without abutting or overlapping the beginning
			 * or end, create an available entry from the leading
			 * fragment, then adjust the start of this hw region to
			 * the end of the excluded region, and continue checking
			 * the next excluded region because another exclusion
			 * could affect the remainder of this hw region.
			 */
			if ((xstart > start) && (xend < end)) {

				if ((maxphyssz != 0) &&
				    (availsz + xstart - start > maxphyssz)) {
					xstart = maxphyssz + start - availsz;
				}
				if (xstart <= start)
					continue;
				if (acnt > 0 &&
				    avail[acnt - 1] == (vm_paddr_t)start) {
					avail[acnt - 1] = (vm_paddr_t)xstart;
				} else {
					avail[acnt++] = (vm_paddr_t)start;
					avail[acnt++] = (vm_paddr_t)xstart;
				}
				availsz += (xstart - start);
				availmem += atop((vm_offset_t)(xstart - start));
				start = xend;
				continue;
			}
			/*
			 * We know the excluded region overlaps either the start
			 * or end of this hardware region (but not both), trim
			 * the excluded portion off the appropriate end.
			 */
			if (xstart <= start)
				start = xend;
			else
				end = xstart;
		}
		/*
		 * If the trimming actions above left a non-zero size, create an
		 * available entry for it.
		 */
		if (end > start) {
			if ((maxphyssz != 0) &&
			    (availsz + end - start > maxphyssz)) {
				end = maxphyssz + start - availsz;
			}
			if (end <= start)
				break;

			if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) {
				avail[acnt - 1] = (vm_paddr_t)end;
			} else {
				avail[acnt++] = (vm_paddr_t)start;
				avail[acnt++] = (vm_paddr_t)end;
			}
			availsz += end - start;
			availmem += atop((vm_offset_t)(end - start));
		}
		if (acnt >= maxavail)
			panic("Not enough space in the dump/phys_avail arrays");
	}

	if (pavail != NULL)
		*pavail = availmem;
	if (prealmem != NULL)
		*prealmem = totalmem;
	return (acnt);
}
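
/*
 * Worked example of the maxphyssz clamp above, using hypothetical numbers:
 * with a single 1 GB hw region at 0x80000000, no matching exclusions, and
 * maxphyssz = 0x20000000 (e.g. hw.physmem=512m), the final entry is cut
 * short so the avail list becomes { 0x80000000, 0xA0000000 } and the walk
 * stops there.
 */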

/*
 * Merge the region at idx with any regions above it that it now overlaps or
 * abuts; returns the new region count.
 */
static size_t
merge_upper_regions(struct region *regions, size_t rcnt, size_t idx)
{
	struct region *lower, *upper;
	vm_paddr_t lend, uend;
	size_t i, mergecnt, movecnt;

	lower = &regions[idx];
	lend = lower->addr + lower->size;

	/*
	 * Continue merging in upper entries as long as we have entries to
	 * merge; the new block could have spanned more than one, although one
	 * is likely the common case.
	 */
	for (i = idx + 1; i < rcnt; i++) {
		upper = &regions[i];
		if (lend < upper->addr || lower->flags != upper->flags)
			break;

		uend = upper->addr + upper->size;
		if (uend > lend) {
			lower->size += uend - lend;
			lend = lower->addr + lower->size;
		}

		if (uend >= lend) {
			/*
			 * If we didn't move past the end of the upper region,
			 * then we don't need to bother checking for another
			 * merge because it would have been done already.  Just
			 * increment i once more to maintain the invariant that
			 * i is one past the last entry merged.
			 */
			i++;
			break;
		}
	}

	/*
	 * We merged in the entries from [idx + 1, i); physically move the tail
	 * end at [i, rcnt) if we need to.
	 */
	mergecnt = i - (idx + 1);
	if (mergecnt > 0) {
		movecnt = rcnt - i;
		if (movecnt == 0) {
			/* Merged all the way to the end, just decrease rcnt. */
			rcnt = idx + 1;
		} else {
			memmove(&regions[idx + 1], &regions[idx + mergecnt + 1],
			    movecnt * sizeof(*regions));
			rcnt -= mergecnt;
		}
	}
	return (rcnt);
}

/*
 * Insertion-sort a new entry into a regions list; sorted by start address.
 */
static size_t
insert_region(struct region *regions, size_t rcnt, vm_paddr_t addr,
    vm_size_t size, uint32_t flags)
{
	size_t i;
	vm_paddr_t nend, rend;
	struct region *ep, *rp;

	nend = addr + size;
	ep = regions + rcnt;
	for (i = 0, rp = regions; i < rcnt; ++i, ++rp) {
		if (flags == rp->flags) {
			rend = rp->addr + rp->size;
			if (addr <= rp->addr && nend >= rp->addr) {
				/*
				 * New mapping overlaps at the beginning; grow
				 * the entry downward to the new start address,
				 * then grow it upward if the new mapping
				 * extends past the old end.
				 */
				rp->size += rp->addr - addr;
				rp->addr = addr;
				if (nend > rend) {
					rp->size += nend - rend;
					rcnt = merge_upper_regions(regions,
					    rcnt, i);
				}
				return (rcnt);
			} else if (addr <= rend && nend > rp->addr) {
				/*
				 * New mapping is either entirely contained
				 * within or it's overlapping at the end.
				 */
				if (nend > rend) {
					rp->size += nend - rend;
					rcnt = merge_upper_regions(regions,
					    rcnt, i);
				}
				return (rcnt);
			}
		}
		if (addr < rp->addr) {
			bcopy(rp, rp + 1, (ep - rp) * sizeof(*rp));
			break;
		}
	}
	rp->addr = addr;
	rp->size = size;
	rp->flags = flags;
	rcnt++;

	return (rcnt);
}
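
/*
 * Hypothetical illustration of the insert/merge logic above: inserting the
 * region [0x1000, 0x5000) into a list that already holds [0x2000, 0x3000)
 * and [0x4000, 0x6000) (same flags) first grows the lower entry to
 * [0x1000, 0x5000), then merge_upper_regions() folds the now-overlapped
 * upper entry in, leaving the single entry [0x1000, 0x6000).
 */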

/*
 * Add a hardware memory region.
 */
void
physmem_hardware_region(uint64_t pa, uint64_t sz)
{
	/*
	 * Filter out the page at PA 0x00000000.  The VM can't handle it, as
	 * pmap_extract() == 0 means failure.
	 */
	if (pa == 0) {
		if (sz <= PAGE_SIZE)
			return;
		pa = PAGE_SIZE;
		sz -= PAGE_SIZE;
	} else if (pa > MAX_PHYS_ADDR) {
		/* This range is past usable memory, ignore it */
		return;
	}

	/*
	 * Also filter out the page at the end of the physical address space --
	 * if addr is non-zero and addr+size is zero we wrapped to the next byte
	 * beyond what vm_paddr_t can express.  That leads to a NULL pointer
	 * deref early in startup; work around it by leaving the last page out.
	 *
	 * XXX This just in: subtract out a whole megabyte, not just 1 page.
	 * Reducing the size by anything less than 1MB results in the NULL
	 * pointer deref in _vm_map_lock_read().  Better to give up a megabyte
	 * than leave some folks with an unusable system while we investigate.
	 */
	if ((pa + sz) > (MAX_PHYS_ADDR - 1024 * 1024)) {
		sz = MAX_PHYS_ADDR - pa + 1;
		if (sz <= 1024 * 1024)
			return;
		sz -= 1024 * 1024;
	}

	if (sz > 0 && hwcnt < nitems(hwregions))
		hwcnt = insert_region(hwregions, hwcnt, pa, sz, 0);
}

/*
 * Add an exclusion region.
 */
void
physmem_exclude_region(vm_paddr_t pa, vm_size_t sz, uint32_t exflags)
{
	vm_offset_t adj;

	/*
	 * Truncate the starting address down to a page boundary, and round the
	 * size up so the region ends on a page boundary.
	 */
	adj = pa - trunc_page(pa);
	pa = trunc_page(pa);
	sz = round_page(sz + adj);

	if (excnt >= nitems(exregions))
		panic("failed to exclude region %#jx-%#jx", (uintmax_t)pa,
		    (uintmax_t)(pa + sz));
	excnt = insert_region(exregions, excnt, pa, sz, exflags);
}

/*
 * Generate an avail list from the hw regions, excluding any pages that fall
 * within a NOALLOC exclusion region.
 */
size_t
physmem_avail(vm_paddr_t *avail, size_t maxavail)
{

	return (regions_to_avail(avail, EXFLAG_NOALLOC, maxavail, 0, NULL,
	    NULL));
}

/*
 * Returns true if the given range lies entirely within one exclusion region.
 */
bool
physmem_excluded(vm_paddr_t pa, vm_size_t sz)
{
	const struct region *exp;
	size_t exi;

	for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) {
		if (pa < exp->addr || pa + sz > exp->addr + exp->size)
			continue;
		return (true);
	}
	return (false);
}

#ifdef _KERNEL
/*
 * Process all the regions added earlier into the global avail lists.
 *
 * Updates the kernel global 'physmem' with the number of physical pages
 * available for use (all pages not in any exclusion region).
 *
 * Updates the kernel global 'Maxmem' with the page number one greater than the
 * last page of physical memory in the system.
 */
void
physmem_init_kernel_globals(void)
{
	size_t nextidx;
	u_long hwphyssz;

	hwphyssz = 0;
	TUNABLE_ULONG_FETCH("hw.physmem", &hwphyssz);

	regions_to_avail(dump_avail, EXFLAG_NODUMP, PHYS_AVAIL_ENTRIES,
	    hwphyssz, NULL, NULL);
	nextidx = regions_to_avail(phys_avail, EXFLAG_NOALLOC,
	    PHYS_AVAIL_ENTRIES, hwphyssz, &physmem, &realmem);
	if (nextidx == 0)
		panic("No memory entries in phys_avail");
	Maxmem = atop(phys_avail[nextidx - 1]);
}
#endif
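
/*
 * Typical usage sketch.  The exact call sites live in each platform's early
 * machine-dependent startup code; the names and addresses here are
 * hypothetical, chosen only to show the expected ordering:
 *
 *	physmem_hardware_region(0x80000000, 0x40000000);
 *	physmem_exclude_region(kernload, kernsize, EXFLAG_NOALLOC);
 *	physmem_exclude_region(msgbuf_pa, msgbuf_sz, EXFLAG_NODUMP);
 *	physmem_init_kernel_globals();
 */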

#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND_FLAGS(physmem, db_show_physmem, DB_CMD_MEMSAFE)
{

	physmem_dump_tables(db_printf);
}

#endif /* DDB */
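
/*
 * From the ddb prompt the table dump above is reachable as "show physmem";
 * hypothetical sample output for the 1 GB example region used earlier:
 *
 *	db> show physmem
 *	Physical memory chunk(s):
 *	  0x80000000 - 0xbfffffff,  1024 MB ( 262144 pages)
 *	Excluded memory regions:
 *	  0x88000000 - 0x88ffffff,    16 MB (   4096 pages) NoAlloc NoDump
 */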