/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2014 Ian Lepore <ian@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef _KERNEL
#include "opt_acpi.h"
#include "opt_ddb.h"
#endif

/*
 * Routines for describing and initializing anything related to physical memory.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/physmem.h>

#ifdef _KERNEL
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>
#include <machine/md_var.h>
#else
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#endif

/*
 * These structures are used internally to keep track of regions of physical
 * ram, and regions within the physical ram that need to be excluded.  An
 * exclusion region can be excluded from crash dumps, from the vm pool of pages
 * that can be allocated, or both, depending on the exclusion flags associated
 * with the region.
 */
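/*
 * For example (an illustrative note, not part of the original comments): a
 * range that must stay out of both the crash dump and the allocator can be
 * registered with both flag bits set:
 *
 *	physmem_exclude_region(pa, sz, EXFLAG_NOALLOC | EXFLAG_NODUMP);
 *
 * regions_to_avail() treats an exclusion as matching whenever any of the
 * flags it is filtering on is set in the region's flags.
 */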
#ifdef DEV_ACPI
#define	MAX_HWCNT	32	/* ACPI needs more regions */
#define	MAX_EXCNT	32
#else
#define	MAX_HWCNT	16
#define	MAX_EXCNT	16
#endif

#if defined(__arm__)
#define	MAX_PHYS_ADDR	0xFFFFFFFFull
#elif defined(__aarch64__) || defined(__riscv)
#define	MAX_PHYS_ADDR	0xFFFFFFFFFFFFFFFFull
#endif

struct region {
	vm_paddr_t	addr;
	vm_size_t	size;
	uint32_t	flags;
};

static struct region hwregions[MAX_HWCNT];
static struct region exregions[MAX_EXCNT];

static size_t hwcnt;
static size_t excnt;

/*
 * realmem is the total number of hardware pages, excluded or not.
 * Maxmem is one greater than the last physical page number.
 */
long realmem;
long Maxmem;

#ifndef _KERNEL
static void
panic(const char *fmt, ...)
{
	va_list va;

	va_start(va, fmt);
	vfprintf(stderr, fmt, va);
	fprintf(stderr, "\n");
	va_end(va);
	__builtin_trap();
}
#endif

/*
 * Print the contents of the physical and excluded region tables using the
 * provided printf-like output function (which will be either printf or
 * db_printf).
 */
static void
physmem_dump_tables(int (*prfunc)(const char *, ...))
{
	size_t i;
	int flags;
	uintmax_t addr, size;
	const unsigned int mbyte = 1024 * 1024;

	prfunc("Physical memory chunk(s):\n");
	for (i = 0; i < hwcnt; ++i) {
		addr = hwregions[i].addr;
		size = hwregions[i].size;
		prfunc("  0x%08jx - 0x%08jx, %5ju MB (%7ju pages)\n", addr,
		    addr + size - 1, size / mbyte, size / PAGE_SIZE);
	}

	prfunc("Excluded memory regions:\n");
	for (i = 0; i < excnt; ++i) {
		addr  = exregions[i].addr;
		size  = exregions[i].size;
		flags = exregions[i].flags;
		prfunc("  0x%08jx - 0x%08jx, %5ju MB (%7ju pages) %s %s\n",
		    addr, addr + size - 1, size / mbyte, size / PAGE_SIZE,
		    (flags & EXFLAG_NOALLOC) ? "NoAlloc" : "",
		    (flags & EXFLAG_NODUMP)  ? "NoDump" : "");
	}

#ifdef DEBUG
	prfunc("Avail lists:\n");
	for (i = 0; phys_avail[i] != 0; ++i) {
		prfunc("  phys_avail[%d] 0x%08x\n", i, phys_avail[i]);
	}
	for (i = 0; dump_avail[i] != 0; ++i) {
		prfunc("  dump_avail[%d] 0x%08x\n", i, dump_avail[i]);
	}
#endif
}

/*
 * Print the contents of the physical and excluded region tables.  Used for
 * bootverbose.
 */
void
physmem_print_tables(void)
{

	physmem_dump_tables(printf);
}

/*
 * Walk the list of hardware regions, processing it against the list of
 * exclusions that contain the given exflags, and generating an "avail list".
 *
 * If maxphyssz is not zero it sets an upper limit, in bytes, on the total
 * "avail list" size.  The walk stops once the limit is reached and the last
 * region is cut short if necessary.
 *
 * Updates the value at *pavail with the number of pages added to the avail
 * list, and the value at *prealmem with the total number of pages in all
 * hardware regions.
 *
 * Returns the number of entries written to the avail list.
 */
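/*
 * Illustrative sketch (hypothetical numbers, not taken from any real
 * platform): given a single 1 GiB hardware region at 0x80000000 and one
 * 1 MiB exclusion at 0x80100000 whose flags match exflags, the function
 * would produce
 *
 *	avail[] = { 0x80000000, 0x80100000, 0x80200000, 0xc0000000 }
 *
 * and return 4, the count of array entries written; *pavail would hold the
 * page count of the two surviving chunks and *prealmem the page count of
 * the whole 1 GiB region.
 */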
static size_t
regions_to_avail(vm_paddr_t *avail, uint32_t exflags, size_t maxavail,
    uint64_t maxphyssz, long *pavail, long *prealmem)
{
	size_t acnt, exi, hwi;
	uint64_t end, start, xend, xstart;
	long availmem, totalmem;
	const struct region *exp, *hwp;
	uint64_t availsz;

	totalmem = 0;
	availmem = 0;
	availsz = 0;
	acnt = 0;
	for (hwi = 0, hwp = hwregions; hwi < hwcnt; ++hwi, ++hwp) {
		start = hwp->addr;
		end   = hwp->size + start;
		totalmem += atop((vm_offset_t)(end - start));
		for (exi = 0, exp = exregions; exi < excnt; ++exi, ++exp) {
			/*
			 * If the excluded region does not match given flags,
			 * continue checking with the next excluded region.
			 */
			if ((exp->flags & exflags) == 0)
				continue;
			xstart = exp->addr;
			xend   = exp->size + xstart;
			/*
			 * If the excluded region ends before this hw region,
			 * continue checking with the next excluded region.
			 */
			if (xend <= start)
				continue;
			/*
			 * If the excluded region begins after this hw region
			 * we're done because both lists are sorted.
			 */
			if (xstart >= end)
				break;
			/*
			 * If the excluded region completely covers this hw
			 * region, shrink this hw region to zero size.
			 */
			if ((start >= xstart) && (end <= xend)) {
				start = xend;
				end = xend;
				break;
			}
			/*
			 * If the excluded region falls wholly within this hw
			 * region without abutting or overlapping the beginning
			 * or end, create an available entry from the leading
			 * fragment, then adjust the start of this hw region to
			 * the end of the excluded region, and continue checking
			 * the next excluded region because another exclusion
			 * could affect the remainder of this hw region.
			 */
			if ((xstart > start) && (xend < end)) {
				if ((maxphyssz != 0) &&
				    (availsz + xstart - start > maxphyssz)) {
					xstart = maxphyssz + start - availsz;
				}
				if (xstart <= start)
					continue;
				if (acnt > 0 &&
				    avail[acnt - 1] == (vm_paddr_t)start) {
					avail[acnt - 1] = (vm_paddr_t)xstart;
				} else {
					avail[acnt++] = (vm_paddr_t)start;
					avail[acnt++] = (vm_paddr_t)xstart;
				}
				availsz += (xstart - start);
				availmem += atop((vm_offset_t)(xstart - start));
				start = xend;
				continue;
			}
			/*
			 * We know the excluded region overlaps either the start
			 * or end of this hardware region (but not both), trim
			 * the excluded portion off the appropriate end.
			 */
			if (xstart <= start)
				start = xend;
			else
				end = xstart;
		}
		/*
		 * If the trimming actions above left a non-zero size, create an
		 * available entry for it.
		 */
		if (end > start) {
			if ((maxphyssz != 0) &&
			    (availsz + end - start > maxphyssz)) {
				end = maxphyssz + start - availsz;
			}
			if (end <= start)
				break;

			if (acnt > 0 && avail[acnt - 1] == (vm_paddr_t)start) {
				avail[acnt - 1] = (vm_paddr_t)end;
			} else {
				avail[acnt++] = (vm_paddr_t)start;
				avail[acnt++] = (vm_paddr_t)end;
			}
			availsz += end - start;
			availmem += atop((vm_offset_t)(end - start));
		}
		if (acnt >= maxavail)
			panic("Not enough space in the dump/phys_avail arrays");
	}

	if (pavail != NULL)
		*pavail = availmem;
	if (prealmem != NULL)
		*prealmem = totalmem;
	return (acnt);
}

/*
 * Check if the region at idx can be merged with the region above it.
 */
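/*
 * Hypothetical example (numbers invented): if the list holds [0x0000-0x2000)
 * and [0x3000-0x5000) with equal flags, and the lower entry has just been
 * grown to end at 0x4000, the two entries now overlap and collapse into a
 * single [0x0000-0x5000) entry, reducing rcnt by one.
 */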
static size_t
merge_upper_regions(struct region *regions, size_t rcnt, size_t idx)
{
	struct region *lower, *upper;
	vm_paddr_t lend, uend;
	size_t i, mergecnt, movecnt;

	lower = &regions[idx];
	lend = lower->addr + lower->size;

	/*
	 * Continue merging in upper entries as long as we have entries to
	 * merge; the new block could have spanned more than one, although one
	 * is likely the common case.
	 */
	for (i = idx + 1; i < rcnt; i++) {
		upper = &regions[i];
		if (lend < upper->addr || lower->flags != upper->flags)
			break;

		uend = upper->addr + upper->size;
		if (uend > lend) {
			lower->size += uend - lend;
			lend = lower->addr + lower->size;
		}

		if (uend >= lend) {
			/*
			 * If we didn't move past the end of the upper region,
			 * then we don't need to bother checking for another
			 * merge because it would have been done already.  Just
			 * increment i once more to maintain the invariant that
			 * i is one past the last entry merged.
			 */
			i++;
			break;
		}
	}

	/*
	 * We merged in the entries from [idx + 1, i); physically move the tail
	 * end at [i, rcnt) if we need to.
	 */
	mergecnt = i - (idx + 1);
	if (mergecnt > 0) {
		movecnt = rcnt - i;
		if (movecnt == 0) {
			/* Merged all the way to the end, just decrease rcnt. */
			rcnt = idx + 1;
		} else {
			memmove(&regions[idx + 1], &regions[idx + mergecnt + 1],
			    movecnt * sizeof(*regions));
			rcnt -= mergecnt;
		}
	}
	return (rcnt);
}

/*
 * Insertion-sort a new entry into a regions list; sorted by start address.
 */
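/*
 * Illustrative example (values invented): inserting a 0x1000-byte entry at
 * 0x2000 into a list whose only entry is [0x0000-0x2000) with the same flags
 * consumes no new slot; the existing entry is simply extended to
 * [0x0000-0x3000).  Entries with differing flags are never coalesced.
 */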
static size_t
insert_region(struct region *regions, size_t rcnt, vm_paddr_t addr,
    vm_size_t size, uint32_t flags)
{
	size_t i;
	vm_paddr_t nend, rend;
	struct region *ep, *rp;

	nend = addr + size;
	ep = regions + rcnt;
	for (i = 0, rp = regions; i < rcnt; ++i, ++rp) {
		if (flags == rp->flags) {
			rend = rp->addr + rp->size;
			if (addr <= rp->addr && nend >= rp->addr) {
				/*
				 * New mapping overlaps at the beginning, shift
				 * for any difference in the beginning then
				 * shift if the new mapping extends past.
				 */
				rp->size += rp->addr - addr;
				rp->addr = addr;
				if (nend > rend) {
					rp->size += nend - rend;
					rcnt = merge_upper_regions(regions,
					    rcnt, i);
				}
				return (rcnt);
			} else if (addr <= rend && nend > rp->addr) {
				/*
				 * New mapping is either entirely contained
				 * within or it's overlapping at the end.
				 */
				if (nend > rend) {
					rp->size += nend - rend;
					rcnt = merge_upper_regions(regions,
					    rcnt, i);
				}
				return (rcnt);
			}
		}
		if (addr < rp->addr) {
			bcopy(rp, rp + 1, (ep - rp) * sizeof(*rp));
			break;
		}
	}
	rp->addr  = addr;
	rp->size  = size;
	rp->flags = flags;
	rcnt++;

	return (rcnt);
}

/*
 * Add a hardware memory region.
 */
void
physmem_hardware_region(uint64_t pa, uint64_t sz)
{
	vm_offset_t adj;

	/*
	 * Filter out the page at PA 0x00000000.  The VM can't handle it, as
	 * pmap_extract() == 0 means failure.
	 */
	if (pa == 0) {
		if (sz <= PAGE_SIZE)
			return;
		pa  = PAGE_SIZE;
		sz -= PAGE_SIZE;
	} else if (pa > MAX_PHYS_ADDR) {
		/* This range is past usable memory, ignore it */
		return;
	}

	/*
	 * Also filter out the page at the end of the physical address space --
	 * if addr is non-zero and addr+size is zero we wrapped to the next byte
	 * beyond what vm_paddr_t can express.  That leads to a NULL pointer
	 * deref early in startup; work around it by leaving the last page out.
	 *
	 * XXX This just in:  subtract out a whole megabyte, not just 1 page.
	 * Reducing the size by anything less than 1MB results in the NULL
	 * pointer deref in _vm_map_lock_read().  Better to give up a megabyte
	 * than leave some folks with an unusable system while we investigate.
	 */
	if ((pa + sz) > (MAX_PHYS_ADDR - 1024 * 1024)) {
		sz = MAX_PHYS_ADDR - pa + 1;
		if (sz <= 1024 * 1024)
			return;
		sz -= 1024 * 1024;
	}

	/*
	 * Round the starting address up to a page boundary, and truncate the
	 * ending address down to a page boundary.
	 */
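	/*
	 * For instance (hypothetical values, assuming 4 KiB pages): a region
	 * reported as pa 0x80000200, sz 0x100000 becomes pa 0x80001000,
	 * sz 0xff000, i.e. [0x80001000, 0x80100000).
	 */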
	adj = round_page(pa) - pa;
	pa  = round_page(pa);
	sz  = trunc_page(sz - adj);

	if (sz > 0 && hwcnt < nitems(hwregions))
		hwcnt = insert_region(hwregions, hwcnt, pa, sz, 0);
}

/*
 * Add an exclusion region.
 */
void
physmem_exclude_region(vm_paddr_t pa, vm_size_t sz, uint32_t exflags)
{
	vm_offset_t adj;

	/*
	 * Truncate the starting address down to a page boundary, and round the
	 * ending address up to a page boundary.
	 */
	adj = pa - trunc_page(pa);
	pa  = trunc_page(pa);
	sz  = round_page(sz + adj);

	if (excnt >= nitems(exregions))
		panic("failed to exclude region %#jx-%#jx", (uintmax_t)pa,
		    (uintmax_t)(pa + sz));
	excnt = insert_region(exregions, excnt, pa, sz, exflags);
}

size_t
physmem_avail(vm_paddr_t *avail, size_t maxavail)
{

	return (regions_to_avail(avail, EXFLAG_NOALLOC, maxavail, 0, NULL, NULL));
}

#ifdef _KERNEL
/*
 * Process all the regions added earlier into the global avail lists.
 *
 * Updates the kernel global 'physmem' with the number of physical pages
 * available for use (all pages not in any exclusion region).
 *
 * Updates the kernel global 'Maxmem' with the page number one greater than the
 * last page of physical memory in the system.
 */
void
physmem_init_kernel_globals(void)
{
	size_t nextidx;
	u_long hwphyssz;

	hwphyssz = 0;
	TUNABLE_ULONG_FETCH("hw.physmem", &hwphyssz);

	regions_to_avail(dump_avail, EXFLAG_NODUMP, PHYS_AVAIL_ENTRIES,
	    hwphyssz, NULL, NULL);
	nextidx = regions_to_avail(phys_avail, EXFLAG_NOALLOC,
	    PHYS_AVAIL_ENTRIES, hwphyssz, &physmem, &realmem);
	if (nextidx == 0)
		panic("No memory entries in phys_avail");
	Maxmem = atop(phys_avail[nextidx - 1]);
}
#endif
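/*
 * Hypothetical usage sketch; the addresses and the kernstart/kernlen symbols
 * below are invented for illustration.  A platform's early initialization
 * code would typically register the RAM discovered from firmware, carve the
 * kernel image out of the allocatable pool, and then populate the global
 * avail lists:
 *
 *	physmem_hardware_region(0x80000000, 0x40000000);
 *	physmem_exclude_region(kernstart, kernlen, EXFLAG_NOALLOC);
 *	physmem_init_kernel_globals();
 */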

#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(physmem, db_show_physmem)
{

	physmem_dump_tables(db_printf);
}

#endif /* DDB */