xref: /freebsd/usr.sbin/bhyve/amd64/e820.c (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG
5  * Author: Corvin Köhne <c.koehne@beckhoff.com>
6  */
7 
8 #include <sys/types.h>
9 #include <sys/queue.h>
10 
11 #include <machine/vmm.h>
12 
13 #include <assert.h>
14 #include <err.h>
15 #include <errno.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 
20 #include "debug.h"
21 #include "e820.h"
22 #include "qemu_fwcfg.h"
23 
/*
 * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it
 * works on physical addresses. If vm_paddr_t is larger than uint64_t, E820
 * can't hold all possible physical addresses and we can get into trouble.
 * Refuse to compile in that case.
 */
static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t),
    "Unable to represent physical memory by E820 table");
31 
/*
 * fwcfg file name of the E820 table. e820_finalize() registers the table
 * under this same name.
 */
#define E820_FWCFG_FILE_NAME "etc/e820"

/* Size units, in bytes. */
#define KB (1024UL)
#define MB (1024 * KB)
#define GB (1024 * MB)

/*
 * Fixed E820 memory holes:
 * [    A0000,    C0000) VGA
 * [    C0000,   100000) ROM
 *
 * e820_init() removes both ranges from system memory.
 */
#define E820_VGA_MEM_BASE 0xA0000
#define E820_VGA_MEM_END 0xC0000
#define E820_ROM_MEM_BASE 0xC0000
#define E820_ROM_MEM_END 0x100000
47 
/*
 * One address range of the E820 table together with its memory type.
 * Elements are linked into e820_table.
 */
struct e820_element {
	/* Linkage within e820_table. */
	TAILQ_ENTRY(e820_element) chain;
	/* First address of the range. */
	uint64_t base;
	/* End address of the range; the exported length is end - base. */
	uint64_t end;
	/* Kind of memory found in this range. */
	enum e820_memory_type type;
};
/*
 * The E820 table itself. e820_add_entry() and e820_add_memory_hole() rely on
 * it being sorted in ascending address order.
 */
static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER(
    e820_table);
56 
57 static struct e820_element *
58 e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type)
59 {
60 	struct e820_element *element;
61 
62 	element = calloc(1, sizeof(*element));
63 	if (element == NULL) {
64 		return (NULL);
65 	}
66 
67 	element->base = base;
68 	element->end = end;
69 	element->type = type;
70 
71 	return (element);
72 }
73 
74 static const char *
75 e820_get_type_name(const enum e820_memory_type type)
76 {
77 	switch (type) {
78 	case E820_TYPE_MEMORY:
79 		return ("RAM");
80 	case E820_TYPE_RESERVED:
81 		return ("Reserved");
82 	case E820_TYPE_ACPI:
83 		return ("ACPI");
84 	case E820_TYPE_NVS:
85 		return ("NVS");
86 	default:
87 		return ("Unknown");
88 	}
89 }
90 
91 void
92 e820_dump_table(void)
93 {
94 	struct e820_element *element;
95 	uint64_t i;
96 
97 	EPRINTLN("E820 map:");
98 
99 	i = 0;
100 	TAILQ_FOREACH(element, &e820_table, chain) {
101 		EPRINTLN("  (%4lu) [%16lx, %16lx] %s", i,
102 		    element->base, element->end,
103 		    e820_get_type_name(element->type));
104 
105 		++i;
106 	}
107 }
108 
109 static struct qemu_fwcfg_item *
110 e820_get_fwcfg_item(void)
111 {
112 	struct qemu_fwcfg_item *fwcfg_item;
113 	struct e820_element *element;
114 	struct e820_entry *entries;
115 	int count, i;
116 
117 	count = 0;
118 	TAILQ_FOREACH(element, &e820_table, chain) {
119 		++count;
120 	}
121 	if (count == 0) {
122 		warnx("%s: E820 table empty", __func__);
123 		return (NULL);
124 	}
125 
126 	fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item));
127 	if (fwcfg_item == NULL) {
128 		return (NULL);
129 	}
130 
131 	fwcfg_item->size = count * sizeof(struct e820_entry);
132 	fwcfg_item->data = calloc(count, sizeof(struct e820_entry));
133 	if (fwcfg_item->data == NULL) {
134 		free(fwcfg_item);
135 		return (NULL);
136 	}
137 
138 	i = 0;
139 	entries = (struct e820_entry *)fwcfg_item->data;
140 	TAILQ_FOREACH(element, &e820_table, chain) {
141 		struct e820_entry *entry = &entries[i];
142 
143 		entry->base = element->base;
144 		entry->length = element->end - element->base;
145 		entry->type = element->type;
146 
147 		++i;
148 	}
149 
150 	return (fwcfg_item);
151 }
152 
153 static int
154 e820_add_entry(const uint64_t base, const uint64_t end,
155     const enum e820_memory_type type)
156 {
157 	struct e820_element *new_element;
158 	struct e820_element *element;
159 	struct e820_element *ram_element;
160 
161 	assert(end >= base);
162 
163 	new_element = e820_element_alloc(base, end, type);
164 	if (new_element == NULL) {
165 		return (ENOMEM);
166 	}
167 
168 	/*
169 	 * E820 table should always be sorted in ascending order. Therefore,
170 	 * search for a range whose end is larger than the base parameter.
171 	 */
172 	TAILQ_FOREACH(element, &e820_table, chain) {
173 		if (element->end > base) {
174 			break;
175 		}
176 	}
177 
178 	/*
179 	 * System memory requires special handling.
180 	 */
181 	if (type == E820_TYPE_MEMORY) {
182 		/*
183 		 * base is larger than of any existing element. Add new system
184 		 * memory at the end of the table.
185 		 */
186 		if (element == NULL) {
187 			TAILQ_INSERT_TAIL(&e820_table, new_element, chain);
188 			return (0);
189 		}
190 
191 		/*
192 		 * System memory shouldn't overlap with any existing element.
193 		 */
194 		assert(end >= element->base);
195 
196 		TAILQ_INSERT_BEFORE(element, new_element, chain);
197 
198 		return (0);
199 	}
200 
201 	/*
202 	 * If some one tries to allocate a specific address, it could happen, that
203 	 * this address is not allocatable. Therefore, do some checks. If the
204 	 * address is not allocatable, don't panic. The user may have a fallback and
205 	 * tries to allocate another address. This is true for the GVT-d emulation
206 	 * which tries to reuse the host address of the graphics stolen memory and
207 	 * falls back to allocating the highest address below 4 GB.
208 	 */
209 	if (element == NULL || element->type != E820_TYPE_MEMORY ||
210 	    (base < element->base || end > element->end))
211 		return (ENOMEM);
212 
213 	if (base == element->base) {
214 		/*
215 		 * New element at system memory base boundary. Add new
216 		 * element before current and adjust the base of the old
217 		 * element.
218 		 *
219 		 * Old table:
220 		 * 	[ 0x1000, 0x4000] RAM		<-- element
221 		 * New table:
222 		 * 	[ 0x1000, 0x2000] Reserved
223 		 * 	[ 0x2000, 0x4000] RAM		<-- element
224 		 */
225 		TAILQ_INSERT_BEFORE(element, new_element, chain);
226 		element->base = end;
227 	} else if (end == element->end) {
228 		/*
229 		 * New element at system memory end boundary. Add new
230 		 * element after current and adjust the end of the
231 		 * current element.
232 		 *
233 		 * Old table:
234 		 * 	[ 0x1000, 0x4000] RAM		<-- element
235 		 * New table:
236 		 * 	[ 0x1000, 0x3000] RAM		<-- element
237 		 * 	[ 0x3000, 0x4000] Reserved
238 		 */
239 		TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain);
240 		element->end = base;
241 	} else {
242 		/*
243 		 * New element inside system memory entry. Split it by
244 		 * adding a system memory element and the new element
245 		 * before current.
246 		 *
247 		 * Old table:
248 		 * 	[ 0x1000, 0x4000] RAM		<-- element
249 		 * New table:
250 		 * 	[ 0x1000, 0x2000] RAM
251 		 * 	[ 0x2000, 0x3000] Reserved
252 		 * 	[ 0x3000, 0x4000] RAM		<-- element
253 		 */
254 		ram_element = e820_element_alloc(element->base, base,
255 		    E820_TYPE_MEMORY);
256 		if (ram_element == NULL) {
257 			return (ENOMEM);
258 		}
259 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
260 		TAILQ_INSERT_BEFORE(element, new_element, chain);
261 		element->base = end;
262 	}
263 
264 	return (0);
265 }
266 
267 static int
268 e820_add_memory_hole(const uint64_t base, const uint64_t end)
269 {
270 	struct e820_element *element;
271 	struct e820_element *ram_element;
272 
273 	assert(end >= base);
274 
275 	/*
276 	 * E820 table should be always sorted in ascending order. Therefore,
277 	 * search for an element which end is larger than the base parameter.
278 	 */
279 	TAILQ_FOREACH(element, &e820_table, chain) {
280 		if (element->end > base) {
281 			break;
282 		}
283 	}
284 
285 	if (element == NULL || end <= element->base) {
286 		/* Nothing to do. Hole already exists */
287 		return (0);
288 	}
289 
290 	/* Memory holes are only allowed in system memory */
291 	assert(element->type == E820_TYPE_MEMORY);
292 
293 	if (base == element->base) {
294 		/*
295 		 * New hole at system memory base boundary.
296 		 *
297 		 * Old table:
298 		 * 	[ 0x1000, 0x4000] RAM
299 		 * New table:
300 		 * 	[ 0x2000, 0x4000] RAM
301 		 */
302 		element->base = end;
303 	} else if (end == element->end) {
304 		/*
305 		 * New hole at system memory end boundary.
306 		 *
307 		 * Old table:
308 		 * 	[ 0x1000, 0x4000] RAM
309 		 * New table:
310 		 * 	[ 0x1000, 0x3000] RAM
311 		 */
312 		element->end = base;
313 	} else {
314 		/*
315 		 * New hole inside system memory entry. Split the system memory.
316 		 *
317 		 * Old table:
318 		 * 	[ 0x1000, 0x4000] RAM		<-- element
319 		 * New table:
320 		 * 	[ 0x1000, 0x2000] RAM
321 		 * 	[ 0x3000, 0x4000] RAM		<-- element
322 		 */
323 		ram_element = e820_element_alloc(element->base, base,
324 		    E820_TYPE_MEMORY);
325 		if (ram_element == NULL) {
326 			return (ENOMEM);
327 		}
328 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
329 		element->base = end;
330 	}
331 
332 	return (0);
333 }
334 
335 static uint64_t
336 e820_alloc_highest(const uint64_t max_address, const uint64_t length,
337     const uint64_t alignment, const enum e820_memory_type type)
338 {
339 	struct e820_element *element;
340 
341 	TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) {
342 		uint64_t address, base, end;
343 
344 		end = MIN(max_address, element->end);
345 		base = roundup2(element->base, alignment);
346 
347 		/*
348 		 * If end - length == 0, we would allocate memory at address 0. This
349 		 * address is mostly unusable and we should avoid allocating it.
350 		 * Therefore, search for another block in that case.
351 		 */
352 		if (element->type != E820_TYPE_MEMORY || end < base ||
353 		    end - base < length || end - length == 0) {
354 			continue;
355 		}
356 
357 		address = rounddown2(end - length, alignment);
358 
359 		if (e820_add_entry(address, address + length, type) != 0) {
360 			return (0);
361 		}
362 
363 		return (address);
364 	}
365 
366 	return (0);
367 }
368 
369 static uint64_t
370 e820_alloc_lowest(const uint64_t min_address, const uint64_t length,
371     const uint64_t alignment, const enum e820_memory_type type)
372 {
373 	struct e820_element *element;
374 
375 	TAILQ_FOREACH(element, &e820_table, chain) {
376 		uint64_t base, end;
377 
378 		end = element->end;
379 		base = MAX(min_address, roundup2(element->base, alignment));
380 
381 		/*
382 		 * If base == 0, we would allocate memory at address 0. This
383 		 * address is mostly unusable and we should avoid allocating it.
384 		 * Therefore, search for another block in that case.
385 		 */
386 		if (element->type != E820_TYPE_MEMORY || end < base ||
387 		    end - base < length || base == 0) {
388 			continue;
389 		}
390 
391 		if (e820_add_entry(base, base + length, type) != 0) {
392 			return (0);
393 		}
394 
395 		return (base);
396 	}
397 
398 	return (0);
399 }
400 
401 uint64_t
402 e820_alloc(const uint64_t address, const uint64_t length,
403     const uint64_t alignment, const enum e820_memory_type type,
404     const enum e820_allocation_strategy strategy)
405 {
406 	assert(powerof2(alignment));
407 	assert((address & (alignment - 1)) == 0);
408 
409 	switch (strategy) {
410 	case E820_ALLOCATE_ANY:
411 		/*
412 		 * Allocate any address. Therefore, ignore the address parameter
413 		 * and reuse the code path for allocating the lowest address.
414 		 */
415 		return (e820_alloc_lowest(0, length, alignment, type));
416 	case E820_ALLOCATE_LOWEST:
417 		return (e820_alloc_lowest(address, length, alignment, type));
418 	case E820_ALLOCATE_HIGHEST:
419 		return (e820_alloc_highest(address, length, alignment, type));
420 	case E820_ALLOCATE_SPECIFIC:
421 		if (e820_add_entry(address, address + length, type) != 0) {
422 			return (0);
423 		}
424 
425 		return (address);
426 	}
427 
428 	return (0);
429 }
430 
431 int
432 e820_init(struct vmctx *const ctx)
433 {
434 	uint64_t lowmem_size, highmem_size;
435 	int error;
436 
437 	TAILQ_INIT(&e820_table);
438 
439 	lowmem_size = vm_get_lowmem_size(ctx);
440 	error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY);
441 	if (error) {
442 		warnx("%s: Could not add lowmem", __func__);
443 		return (error);
444 	}
445 
446 	highmem_size = vm_get_highmem_size(ctx);
447 	if (highmem_size != 0) {
448 		error = e820_add_entry(4 * GB, 4 * GB + highmem_size,
449 		    E820_TYPE_MEMORY);
450 		if (error) {
451 			warnx("%s: Could not add highmem", __func__);
452 			return (error);
453 		}
454 	}
455 
456 	error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END);
457 	if (error) {
458 		warnx("%s: Could not add VGA memory", __func__);
459 		return (error);
460 	}
461 
462 	error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END);
463 	if (error) {
464 		warnx("%s: Could not add ROM area", __func__);
465 		return (error);
466 	}
467 
468 	return (0);
469 }
470 
471 int
472 e820_finalize(void)
473 {
474 	struct qemu_fwcfg_item *e820_fwcfg_item;
475 	int error;
476 
477 	e820_fwcfg_item = e820_get_fwcfg_item();
478 	if (e820_fwcfg_item == NULL) {
479 		warnx("invalid e820 table");
480 		return (ENOMEM);
481 	}
482 	error = qemu_fwcfg_add_file("etc/e820",
483 	    e820_fwcfg_item->size, e820_fwcfg_item->data);
484 	if (error != 0) {
485 		warnx("could not add qemu fwcfg etc/e820");
486 		free(e820_fwcfg_item->data);
487 		free(e820_fwcfg_item);
488 		return (error);
489 	}
490 	free(e820_fwcfg_item);
491 
492 	return (0);
493 }
494