xref: /freebsd/usr.sbin/bhyve/amd64/e820.c (revision 2e620256bd76c449c835c604e404483437743011)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG
5  * Author: Corvin Köhne <c.koehne@beckhoff.com>
6  */
7 
8 #include <sys/types.h>
9 #include <sys/queue.h>
10 
11 #include <machine/vmm.h>
12 
13 #include <assert.h>
14 #include <err.h>
15 #include <errno.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 
20 #include "e820.h"
21 #include "qemu_fwcfg.h"
22 
23 /*
24  * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it
25  * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't
26  * hold all possible physical addresses and we can get into trouble.
27  */
28 static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t),
29     "Unable to represent physical memory by E820 table");
30 
/* Name under which the E820 table is published through QEMU's fwcfg. */
#define E820_FWCFG_FILE_NAME "etc/e820"

/* Binary size units. All of them have type unsigned long. */
#define KB (1UL << 10)
#define MB (KB << 10)
#define GB (MB << 10)

/*
 * Fixed E820 memory holes:
 * [    A0000,    C0000) VGA
 * [    C0000,   100000) ROM
 */
#define E820_VGA_MEM_BASE 0xA0000
#define E820_VGA_MEM_END 0xC0000
#define E820_ROM_MEM_BASE 0xC0000
#define E820_ROM_MEM_END 0x100000
46 
47 struct e820_element {
48 	TAILQ_ENTRY(e820_element) chain;
49 	uint64_t base;
50 	uint64_t end;
51 	enum e820_memory_type type;
52 };
53 static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER(
54     e820_table);
55 
56 static struct e820_element *
57 e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type)
58 {
59 	struct e820_element *element;
60 
61 	element = calloc(1, sizeof(*element));
62 	if (element == NULL) {
63 		return (NULL);
64 	}
65 
66 	element->base = base;
67 	element->end = end;
68 	element->type = type;
69 
70 	return (element);
71 }
72 
73 static const char *
74 e820_get_type_name(const enum e820_memory_type type)
75 {
76 	switch (type) {
77 	case E820_TYPE_MEMORY:
78 		return ("RAM");
79 	case E820_TYPE_RESERVED:
80 		return ("Reserved");
81 	case E820_TYPE_ACPI:
82 		return ("ACPI");
83 	case E820_TYPE_NVS:
84 		return ("NVS");
85 	default:
86 		return ("Unknown");
87 	}
88 }
89 
90 void
91 e820_dump_table(void)
92 {
93 	struct e820_element *element;
94 	uint64_t i;
95 
96 	fprintf(stderr, "E820 map:\n");
97 
98 	i = 0;
99 	TAILQ_FOREACH(element, &e820_table, chain) {
100 		fprintf(stderr, "  (%4lu) [%16lx, %16lx] %s\n", i,
101 		    element->base, element->end,
102 		    e820_get_type_name(element->type));
103 
104 		++i;
105 	}
106 }
107 
108 static struct qemu_fwcfg_item *
109 e820_get_fwcfg_item(void)
110 {
111 	struct qemu_fwcfg_item *fwcfg_item;
112 	struct e820_element *element;
113 	struct e820_entry *entries;
114 	int count, i;
115 
116 	count = 0;
117 	TAILQ_FOREACH(element, &e820_table, chain) {
118 		++count;
119 	}
120 	if (count == 0) {
121 		warnx("%s: E820 table empty", __func__);
122 		return (NULL);
123 	}
124 
125 	fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item));
126 	if (fwcfg_item == NULL) {
127 		return (NULL);
128 	}
129 
130 	fwcfg_item->size = count * sizeof(struct e820_entry);
131 	fwcfg_item->data = calloc(count, sizeof(struct e820_entry));
132 	if (fwcfg_item->data == NULL) {
133 		free(fwcfg_item);
134 		return (NULL);
135 	}
136 
137 	i = 0;
138 	entries = (struct e820_entry *)fwcfg_item->data;
139 	TAILQ_FOREACH(element, &e820_table, chain) {
140 		struct e820_entry *entry = &entries[i];
141 
142 		entry->base = element->base;
143 		entry->length = element->end - element->base;
144 		entry->type = element->type;
145 
146 		++i;
147 	}
148 
149 	return (fwcfg_item);
150 }
151 
152 static int
153 e820_add_entry(const uint64_t base, const uint64_t end,
154     const enum e820_memory_type type)
155 {
156 	struct e820_element *new_element;
157 	struct e820_element *element;
158 	struct e820_element *ram_element;
159 
160 	assert(end >= base);
161 
162 	new_element = e820_element_alloc(base, end, type);
163 	if (new_element == NULL) {
164 		return (ENOMEM);
165 	}
166 
167 	/*
168 	 * E820 table should always be sorted in ascending order. Therefore,
169 	 * search for a range whose end is larger than the base parameter.
170 	 */
171 	TAILQ_FOREACH(element, &e820_table, chain) {
172 		if (element->end > base) {
173 			break;
174 		}
175 	}
176 
177 	/*
178 	 * System memory requires special handling.
179 	 */
180 	if (type == E820_TYPE_MEMORY) {
181 		/*
182 		 * base is larger than of any existing element. Add new system
183 		 * memory at the end of the table.
184 		 */
185 		if (element == NULL) {
186 			TAILQ_INSERT_TAIL(&e820_table, new_element, chain);
187 			return (0);
188 		}
189 
190 		/*
191 		 * System memory shouldn't overlap with any existing element.
192 		 */
193 		assert(end >= element->base);
194 
195 		TAILQ_INSERT_BEFORE(element, new_element, chain);
196 
197 		return (0);
198 	}
199 
200 	/*
201 	 * If some one tries to allocate a specific address, it could happen, that
202 	 * this address is not allocatable. Therefore, do some checks. If the
203 	 * address is not allocatable, don't panic. The user may have a fallback and
204 	 * tries to allocate another address. This is true for the GVT-d emulation
205 	 * which tries to reuse the host address of the graphics stolen memory and
206 	 * falls back to allocating the highest address below 4 GB.
207 	 */
208 	if (element == NULL || element->type != E820_TYPE_MEMORY ||
209 	    (base < element->base || end > element->end))
210 		return (ENOMEM);
211 
212 	if (base == element->base) {
213 		/*
214 		 * New element at system memory base boundary. Add new
215 		 * element before current and adjust the base of the old
216 		 * element.
217 		 *
218 		 * Old table:
219 		 * 	[ 0x1000, 0x4000] RAM		<-- element
220 		 * New table:
221 		 * 	[ 0x1000, 0x2000] Reserved
222 		 * 	[ 0x2000, 0x4000] RAM		<-- element
223 		 */
224 		TAILQ_INSERT_BEFORE(element, new_element, chain);
225 		element->base = end;
226 	} else if (end == element->end) {
227 		/*
228 		 * New element at system memory end boundary. Add new
229 		 * element after current and adjust the end of the
230 		 * current element.
231 		 *
232 		 * Old table:
233 		 * 	[ 0x1000, 0x4000] RAM		<-- element
234 		 * New table:
235 		 * 	[ 0x1000, 0x3000] RAM		<-- element
236 		 * 	[ 0x3000, 0x4000] Reserved
237 		 */
238 		TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain);
239 		element->end = base;
240 	} else {
241 		/*
242 		 * New element inside system memory entry. Split it by
243 		 * adding a system memory element and the new element
244 		 * before current.
245 		 *
246 		 * Old table:
247 		 * 	[ 0x1000, 0x4000] RAM		<-- element
248 		 * New table:
249 		 * 	[ 0x1000, 0x2000] RAM
250 		 * 	[ 0x2000, 0x3000] Reserved
251 		 * 	[ 0x3000, 0x4000] RAM		<-- element
252 		 */
253 		ram_element = e820_element_alloc(element->base, base,
254 		    E820_TYPE_MEMORY);
255 		if (ram_element == NULL) {
256 			return (ENOMEM);
257 		}
258 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
259 		TAILQ_INSERT_BEFORE(element, new_element, chain);
260 		element->base = end;
261 	}
262 
263 	return (0);
264 }
265 
266 static int
267 e820_add_memory_hole(const uint64_t base, const uint64_t end)
268 {
269 	struct e820_element *element;
270 	struct e820_element *ram_element;
271 
272 	assert(end >= base);
273 
274 	/*
275 	 * E820 table should be always sorted in ascending order. Therefore,
276 	 * search for an element which end is larger than the base parameter.
277 	 */
278 	TAILQ_FOREACH(element, &e820_table, chain) {
279 		if (element->end > base) {
280 			break;
281 		}
282 	}
283 
284 	if (element == NULL || end <= element->base) {
285 		/* Nothing to do. Hole already exists */
286 		return (0);
287 	}
288 
289 	/* Memory holes are only allowed in system memory */
290 	assert(element->type == E820_TYPE_MEMORY);
291 
292 	if (base == element->base) {
293 		/*
294 		 * New hole at system memory base boundary.
295 		 *
296 		 * Old table:
297 		 * 	[ 0x1000, 0x4000] RAM
298 		 * New table:
299 		 * 	[ 0x2000, 0x4000] RAM
300 		 */
301 		element->base = end;
302 	} else if (end == element->end) {
303 		/*
304 		 * New hole at system memory end boundary.
305 		 *
306 		 * Old table:
307 		 * 	[ 0x1000, 0x4000] RAM
308 		 * New table:
309 		 * 	[ 0x1000, 0x3000] RAM
310 		 */
311 		element->end = base;
312 	} else {
313 		/*
314 		 * New hole inside system memory entry. Split the system memory.
315 		 *
316 		 * Old table:
317 		 * 	[ 0x1000, 0x4000] RAM		<-- element
318 		 * New table:
319 		 * 	[ 0x1000, 0x2000] RAM
320 		 * 	[ 0x3000, 0x4000] RAM		<-- element
321 		 */
322 		ram_element = e820_element_alloc(element->base, base,
323 		    E820_TYPE_MEMORY);
324 		if (ram_element == NULL) {
325 			return (ENOMEM);
326 		}
327 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
328 		element->base = end;
329 	}
330 
331 	return (0);
332 }
333 
334 static uint64_t
335 e820_alloc_highest(const uint64_t max_address, const uint64_t length,
336     const uint64_t alignment, const enum e820_memory_type type)
337 {
338 	struct e820_element *element;
339 
340 	TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) {
341 		uint64_t address, base, end;
342 
343 		end = MIN(max_address, element->end);
344 		base = roundup2(element->base, alignment);
345 
346 		/*
347 		 * If end - length == 0, we would allocate memory at address 0. This
348 		 * address is mostly unusable and we should avoid allocating it.
349 		 * Therefore, search for another block in that case.
350 		 */
351 		if (element->type != E820_TYPE_MEMORY || end < base ||
352 		    end - base < length || end - length == 0) {
353 			continue;
354 		}
355 
356 		address = rounddown2(end - length, alignment);
357 
358 		if (e820_add_entry(address, address + length, type) != 0) {
359 			return (0);
360 		}
361 
362 		return (address);
363 	}
364 
365 	return (0);
366 }
367 
368 static uint64_t
369 e820_alloc_lowest(const uint64_t min_address, const uint64_t length,
370     const uint64_t alignment, const enum e820_memory_type type)
371 {
372 	struct e820_element *element;
373 
374 	TAILQ_FOREACH(element, &e820_table, chain) {
375 		uint64_t base, end;
376 
377 		end = element->end;
378 		base = MAX(min_address, roundup2(element->base, alignment));
379 
380 		/*
381 		 * If base == 0, we would allocate memory at address 0. This
382 		 * address is mostly unusable and we should avoid allocating it.
383 		 * Therefore, search for another block in that case.
384 		 */
385 		if (element->type != E820_TYPE_MEMORY || end < base ||
386 		    end - base < length || base == 0) {
387 			continue;
388 		}
389 
390 		if (e820_add_entry(base, base + length, type) != 0) {
391 			return (0);
392 		}
393 
394 		return (base);
395 	}
396 
397 	return (0);
398 }
399 
400 uint64_t
401 e820_alloc(const uint64_t address, const uint64_t length,
402     const uint64_t alignment, const enum e820_memory_type type,
403     const enum e820_allocation_strategy strategy)
404 {
405 	assert(powerof2(alignment));
406 	assert((address & (alignment - 1)) == 0);
407 
408 	switch (strategy) {
409 	case E820_ALLOCATE_ANY:
410 		/*
411 		 * Allocate any address. Therefore, ignore the address parameter
412 		 * and reuse the code path for allocating the lowest address.
413 		 */
414 		return (e820_alloc_lowest(0, length, alignment, type));
415 	case E820_ALLOCATE_LOWEST:
416 		return (e820_alloc_lowest(address, length, alignment, type));
417 	case E820_ALLOCATE_HIGHEST:
418 		return (e820_alloc_highest(address, length, alignment, type));
419 	case E820_ALLOCATE_SPECIFIC:
420 		if (e820_add_entry(address, address + length, type) != 0) {
421 			return (0);
422 		}
423 
424 		return (address);
425 	}
426 
427 	return (0);
428 }
429 
430 int
431 e820_init(struct vmctx *const ctx)
432 {
433 	uint64_t lowmem_size, highmem_size;
434 	int error;
435 
436 	TAILQ_INIT(&e820_table);
437 
438 	lowmem_size = vm_get_lowmem_size(ctx);
439 	error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY);
440 	if (error) {
441 		warnx("%s: Could not add lowmem", __func__);
442 		return (error);
443 	}
444 
445 	highmem_size = vm_get_highmem_size(ctx);
446 	if (highmem_size != 0) {
447 		error = e820_add_entry(4 * GB, 4 * GB + highmem_size,
448 		    E820_TYPE_MEMORY);
449 		if (error) {
450 			warnx("%s: Could not add highmem", __func__);
451 			return (error);
452 		}
453 	}
454 
455 	error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END);
456 	if (error) {
457 		warnx("%s: Could not add VGA memory", __func__);
458 		return (error);
459 	}
460 
461 	error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END);
462 	if (error) {
463 		warnx("%s: Could not add ROM area", __func__);
464 		return (error);
465 	}
466 
467 	return (0);
468 }
469 
470 int
471 e820_finalize(void)
472 {
473 	struct qemu_fwcfg_item *e820_fwcfg_item;
474 	int error;
475 
476 	e820_fwcfg_item = e820_get_fwcfg_item();
477 	if (e820_fwcfg_item == NULL) {
478 		warnx("invalid e820 table");
479 		return (ENOMEM);
480 	}
481 	error = qemu_fwcfg_add_file("etc/e820",
482 	    e820_fwcfg_item->size, e820_fwcfg_item->data);
483 	if (error != 0) {
484 		warnx("could not add qemu fwcfg etc/e820");
485 		return (error);
486 	}
487 	free(e820_fwcfg_item);
488 
489 	return (0);
490 }
491