/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG
 * Author: Corvin Köhne <c.koehne@beckhoff.com>
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <machine/vmm.h>

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "debug.h"
#include "e820.h"
#include "qemu_fwcfg.h"

/*
 * E820 always uses 64-bit entries. Emulation code will use vm_paddr_t since it
 * works on physical addresses. If vm_paddr_t is larger than uint64_t, E820
 * can't hold all possible physical addresses and we can get into trouble.
 */
static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t),
    "Unable to represent physical memory by E820 table");

#define E820_FWCFG_FILE_NAME "etc/e820"

#define KB (1024UL)
#define MB (1024 * KB)
#define GB (1024 * MB)

/*
 * Fixed E820 memory holes:
 * [    A0000,    C0000) VGA
 * [    C0000,   100000) ROM
 */
#define E820_VGA_MEM_BASE 0xA0000
#define E820_VGA_MEM_END 0xC0000
#define E820_ROM_MEM_BASE 0xC0000
#define E820_ROM_MEM_END 0x100000

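/*
 * The table below is kept sorted by base address in ascending order and free
 * of overlapping entries; adjacent entries of the same type are merged
 * whenever a new entry is inserted.
 */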
struct e820_element {
	TAILQ_ENTRY(e820_element) chain;
	uint64_t base;
	uint64_t end;
	enum e820_memory_type type;
};
static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER(
    e820_table);

static struct e820_element *
e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type)
{
	struct e820_element *element;

	element = calloc(1, sizeof(*element));
	if (element == NULL) {
		return (NULL);
	}

	element->base = base;
	element->end = end;
	element->type = type;

	return (element);
}

static const char *
e820_get_type_name(const enum e820_memory_type type)
{
	switch (type) {
	case E820_TYPE_MEMORY:
		return ("RAM");
	case E820_TYPE_RESERVED:
		return ("Reserved");
	case E820_TYPE_ACPI:
		return ("ACPI");
	case E820_TYPE_NVS:
		return ("NVS");
	default:
		return ("Unknown");
	}
}
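
/*
 * e820_dump_table() prints one line per table entry. Illustrative output for
 * a guest with 1 GB of memory below 4 GB (the exact contents depend on the VM
 * configuration and on ranges later reserved by device models):
 *
 *	E820 map:
 *	  (   0) [               0,            a0000] RAM
 *	  (   1) [          100000,         40000000] RAM
 */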
void
e820_dump_table(void)
{
	struct e820_element *element;
	uint64_t i;

	EPRINTLN("E820 map:");

	i = 0;
	TAILQ_FOREACH(element, &e820_table, chain) {
		EPRINTLN("  (%4lu) [%16lx, %16lx] %s", i,
		    element->base, element->end,
		    e820_get_type_name(element->type));

		++i;
	}
}
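
/*
 * Convert the table into the flat array of struct e820_entry (base, length,
 * type) that is handed to the guest firmware via fw_cfg.
 */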
static struct qemu_fwcfg_item *
e820_get_fwcfg_item(void)
{
	struct qemu_fwcfg_item *fwcfg_item;
	struct e820_element *element;
	struct e820_entry *entries;
	int count, i;

	count = 0;
	TAILQ_FOREACH(element, &e820_table, chain) {
		++count;
	}
	if (count == 0) {
		warnx("%s: E820 table empty", __func__);
		return (NULL);
	}

	fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item));
	if (fwcfg_item == NULL) {
		return (NULL);
	}

	fwcfg_item->size = count * sizeof(struct e820_entry);
	fwcfg_item->data = calloc(count, sizeof(struct e820_entry));
	if (fwcfg_item->data == NULL) {
		free(fwcfg_item);
		return (NULL);
	}

	i = 0;
	entries = (struct e820_entry *)fwcfg_item->data;
	TAILQ_FOREACH(element, &e820_table, chain) {
		struct e820_entry *entry = &entries[i];

		entry->base = element->base;
		entry->length = element->end - element->base;
		entry->type = element->type;

		++i;
	}

	return (fwcfg_item);
}

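/*
 * Insert [base, end) with the given type into the E820 table. System memory
 * entries are linked in at their sorted position; any other type has to be
 * carved out of an existing system memory entry, which is shrunk or split as
 * needed. Returns 0 on success or ENOMEM if the range can't be allocated.
 */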
static int
e820_add_entry(const uint64_t base, const uint64_t end,
    const enum e820_memory_type type)
{
	struct e820_element *new_element;
	struct e820_element *element;
	struct e820_element *sib_element;
	struct e820_element *ram_element;

	assert(end >= base);

	new_element = e820_element_alloc(base, end, type);
	if (new_element == NULL) {
		return (ENOMEM);
	}

	/*
	 * The E820 table should always be sorted in ascending order. Therefore,
	 * search for the first range whose end is larger than the base
	 * parameter.
	 */
	TAILQ_FOREACH(element, &e820_table, chain) {
		if (element->end > base) {
			break;
		}
	}

	/*
	 * System memory requires special handling.
	 */
	if (type == E820_TYPE_MEMORY) {
		/*
		 * base is larger than the end of any existing element. Add the
		 * new system memory at the end of the table.
		 */
		if (element == NULL) {
			TAILQ_INSERT_TAIL(&e820_table, new_element, chain);
			return (0);
		}

		/*
		 * System memory shouldn't overlap with any existing element.
		 */
		assert(end <= element->base);

		TAILQ_INSERT_BEFORE(element, new_element, chain);

		return (0);
	}

	/*
	 * If someone tries to allocate a specific address, it may turn out that
	 * this address is not allocatable. Therefore, do some checks. If the
	 * address is not allocatable, don't panic. The caller may have a
	 * fallback and try to allocate another address. This is the case for
	 * the GVT-d emulation, which tries to reuse the host address of the
	 * graphics stolen memory and falls back to allocating the highest
	 * address below 4 GB.
	 */
	if (element == NULL || element->type != E820_TYPE_MEMORY ||
	    (base < element->base || end > element->end))
		return (ENOMEM);

	if (base == element->base && end == element->end) {
		/*
		 * The new entry replaces an existing one.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM		<-- element
		 * New table:
		 *	[ 0x1000, 0x4000] Reserved
		 */
		TAILQ_INSERT_BEFORE(element, new_element, chain);
		TAILQ_REMOVE(&e820_table, element, chain);
		free(element);
	} else if (base == element->base) {
		/*
		 * New element at system memory base boundary. Add new
		 * element before current and adjust the base of the old
		 * element.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM		<-- element
		 * New table:
		 *	[ 0x1000, 0x2000] Reserved
		 *	[ 0x2000, 0x4000] RAM		<-- element
		 */
		TAILQ_INSERT_BEFORE(element, new_element, chain);
		element->base = end;
	} else if (end == element->end) {
		/*
		 * New element at system memory end boundary. Add new
		 * element after current and adjust the end of the
		 * current element.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM		<-- element
		 * New table:
		 *	[ 0x1000, 0x3000] RAM		<-- element
		 *	[ 0x3000, 0x4000] Reserved
		 */
		TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain);
		element->end = base;
	} else {
		/*
		 * New element inside system memory entry. Split it by
		 * adding a system memory element and the new element
		 * before current.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM		<-- element
		 * New table:
		 *	[ 0x1000, 0x2000] RAM
		 *	[ 0x2000, 0x3000] Reserved
		 *	[ 0x3000, 0x4000] RAM		<-- element
		 */
		ram_element = e820_element_alloc(element->base, base,
		    E820_TYPE_MEMORY);
		if (ram_element == NULL) {
			return (ENOMEM);
		}
		TAILQ_INSERT_BEFORE(element, ram_element, chain);
		TAILQ_INSERT_BEFORE(element, new_element, chain);
		element->base = end;
	}

	/*
	 * If the previous element has the same type and ends at our base
	 * boundary, we can merge both entries.
	 */
	sib_element = TAILQ_PREV(new_element, e820_table, chain);
	if (sib_element != NULL &&
	    sib_element->type == new_element->type &&
	    sib_element->end == new_element->base) {
		new_element->base = sib_element->base;
		TAILQ_REMOVE(&e820_table, sib_element, chain);
		free(sib_element);
	}

	/*
	 * If the next element has the same type and starts at our end
	 * boundary, we can merge both entries.
	 */
	sib_element = TAILQ_NEXT(new_element, chain);
	if (sib_element != NULL &&
	    sib_element->type == new_element->type &&
	    sib_element->base == new_element->end) {
		/* Merge new element into subsequent one. */
		new_element->end = sib_element->end;
		TAILQ_REMOVE(&e820_table, sib_element, chain);
		free(sib_element);
	}

	return (0);
}

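/*
 * Punch a hole [base, end) into the system memory entry covering that range,
 * shrinking or splitting the entry as needed. Used for the fixed VGA and ROM
 * holes below 1 MB.
 */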
static int
e820_add_memory_hole(const uint64_t base, const uint64_t end)
{
	struct e820_element *element;
	struct e820_element *ram_element;

	assert(end >= base);

	/*
	 * The E820 table should always be sorted in ascending order. Therefore,
	 * search for the first element whose end is larger than the base
	 * parameter.
	 */
	TAILQ_FOREACH(element, &e820_table, chain) {
		if (element->end > base) {
			break;
		}
	}

	if (element == NULL || end <= element->base) {
		/* Nothing to do. The hole already exists. */
		return (0);
	}

	/* Memory holes are only allowed in system memory. */
	assert(element->type == E820_TYPE_MEMORY);

	if (base == element->base) {
		/*
		 * New hole at system memory base boundary.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM
		 * New table:
		 *	[ 0x2000, 0x4000] RAM
		 */
		element->base = end;
	} else if (end == element->end) {
		/*
		 * New hole at system memory end boundary.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM
		 * New table:
		 *	[ 0x1000, 0x3000] RAM
		 */
		element->end = base;
	} else {
		/*
		 * New hole inside system memory entry. Split the system memory.
		 *
		 * Old table:
		 *	[ 0x1000, 0x4000] RAM		<-- element
		 * New table:
		 *	[ 0x1000, 0x2000] RAM
		 *	[ 0x3000, 0x4000] RAM		<-- element
		 */
		ram_element = e820_element_alloc(element->base, base,
		    E820_TYPE_MEMORY);
		if (ram_element == NULL) {
			return (ENOMEM);
		}
		TAILQ_INSERT_BEFORE(element, ram_element, chain);
		element->base = end;
	}

	return (0);
}

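/*
 * Allocate the highest suitably aligned range of the given length that ends
 * at or below max_address. Returns the base of the new entry, or 0 if no
 * fitting range exists.
 */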
static uint64_t
e820_alloc_highest(const uint64_t max_address, const uint64_t length,
    const uint64_t alignment, const enum e820_memory_type type)
{
	struct e820_element *element;

	TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) {
		uint64_t address, base, end;

		end = MIN(max_address, element->end);
		base = roundup2(element->base, alignment);

		/*
		 * If end - length == 0, we would allocate memory at address 0. This
		 * address is mostly unusable and we should avoid allocating it.
		 * Therefore, search for another block in that case.
		 */
		if (element->type != E820_TYPE_MEMORY || end < base ||
		    end - base < length || end - length == 0) {
			continue;
		}

		address = rounddown2(end - length, alignment);

		if (e820_add_entry(address, address + length, type) != 0) {
			return (0);
		}

		return (address);
	}

	return (0);
}

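/*
 * Allocate the lowest suitably aligned range of the given length that starts
 * at or above min_address. Returns the base of the new entry, or 0 if no
 * fitting range exists.
 */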
static uint64_t
e820_alloc_lowest(const uint64_t min_address, const uint64_t length,
    const uint64_t alignment, const enum e820_memory_type type)
{
	struct e820_element *element;

	TAILQ_FOREACH(element, &e820_table, chain) {
		uint64_t base, end;

		end = element->end;
		base = MAX(min_address, roundup2(element->base, alignment));

		/*
		 * If base == 0, we would allocate memory at address 0. This
		 * address is mostly unusable and we should avoid allocating it.
		 * Therefore, search for another block in that case.
		 */
		if (element->type != E820_TYPE_MEMORY || end < base ||
		    end - base < length || base == 0) {
			continue;
		}

		if (e820_add_entry(base, base + length, type) != 0) {
			return (0);
		}

		return (base);
	}

	return (0);
}

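/*
 * Allocate [address, address + length) according to the given strategy. A
 * sketch of a typical call (illustrative, not taken from a particular device
 * model): reserving a 4 KB, page-aligned range as high as possible below
 * 4 GB would look like
 *
 *	addr = e820_alloc(4 * GB, 4 * KB, 4 * KB, E820_TYPE_RESERVED,
 *	    E820_ALLOCATE_HIGHEST);
 *
 * A return value of 0 means the allocation failed and the caller may retry
 * with a different strategy.
 */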
uint64_t
e820_alloc(const uint64_t address, const uint64_t length,
    const uint64_t alignment, const enum e820_memory_type type,
    const enum e820_allocation_strategy strategy)
{
	assert(powerof2(alignment));
	assert((address & (alignment - 1)) == 0);

	switch (strategy) {
	case E820_ALLOCATE_ANY:
		/*
		 * Allocate any address. Therefore, ignore the address parameter
		 * and reuse the code path for allocating the lowest address.
		 */
		return (e820_alloc_lowest(0, length, alignment, type));
	case E820_ALLOCATE_LOWEST:
		return (e820_alloc_lowest(address, length, alignment, type));
	case E820_ALLOCATE_HIGHEST:
		return (e820_alloc_highest(address, length, alignment, type));
	case E820_ALLOCATE_SPECIFIC:
		if (e820_add_entry(address, address + length, type) != 0) {
			return (0);
		}

		return (address);
	}

	return (0);
}

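/*
 * Build the initial E820 table from the guest memory layout: system memory
 * below 4 GB, system memory above 4 GB (if any), and the fixed VGA and ROM
 * holes below 1 MB.
 */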
int
e820_init(struct vmctx *const ctx)
{
	uint64_t lowmem_size, highmem_size;
	int error;

	TAILQ_INIT(&e820_table);

	lowmem_size = vm_get_lowmem_size(ctx);
	error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY);
	if (error) {
		warnx("%s: Could not add lowmem", __func__);
		return (error);
	}

	highmem_size = vm_get_highmem_size(ctx);
	if (highmem_size != 0) {
		error = e820_add_entry(4 * GB, 4 * GB + highmem_size,
		    E820_TYPE_MEMORY);
		if (error) {
			warnx("%s: Could not add highmem", __func__);
			return (error);
		}
	}

	error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END);
	if (error) {
		warnx("%s: Could not add VGA memory", __func__);
		return (error);
	}

	error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END);
	if (error) {
		warnx("%s: Could not add ROM area", __func__);
		return (error);
	}

	return (0);
}

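/*
 * Publish the final E820 table to the guest firmware as the fw_cfg file
 * "etc/e820".
 */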
int
e820_finalize(void)
{
	struct qemu_fwcfg_item *e820_fwcfg_item;
	int error;

	e820_fwcfg_item = e820_get_fwcfg_item();
	if (e820_fwcfg_item == NULL) {
		warnx("invalid e820 table");
		return (ENOMEM);
	}
	error = qemu_fwcfg_add_file(E820_FWCFG_FILE_NAME,
	    e820_fwcfg_item->size, e820_fwcfg_item->data);
	if (error != 0) {
		warnx("could not add qemu fwcfg etc/e820");
		free(e820_fwcfg_item->data);
		free(e820_fwcfg_item);
		return (error);
	}
	free(e820_fwcfg_item);

	return (0);
}
534