xref: /illumos-gate/usr/src/cmd/bhyve/amd64/e820.c (revision b1529121add3ff25fc7b58196363a5439c0f6b67)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG
5  * Author: Corvin Köhne <c.koehne@beckhoff.com>
6  */
7 
8 #include <sys/types.h>
9 #include <sys/queue.h>
10 
11 #include <machine/vmm.h>
12 
13 #include <assert.h>
14 #include <err.h>
15 #include <errno.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 
20 #include "debug.h"
21 #include "e820.h"
22 #include "qemu_fwcfg.h"
23 
24 /*
25  * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it
26  * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't
27  * hold all possible physical addresses and we can get into trouble.
28  */
29 static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t),
30     "Unable to represent physical memory by E820 table");
31 
/* fwcfg file name for the E820 table. */
#define E820_FWCFG_FILE_NAME "etc/e820"

/* Size helpers, expressed in bytes. */
#define KB (1024UL)
#define MB (1024 * KB)
#define GB (1024 * MB)

/*
 * Fixed E820 memory holes:
 * [    A0000,    C0000) VGA
 * [    C0000,   100000) ROM
 */
#define E820_VGA_MEM_BASE 0xA0000
#define E820_VGA_MEM_END 0xC0000
#define E820_ROM_MEM_BASE 0xC0000
#define E820_ROM_MEM_END 0x100000
47 
/*
 * One range of the E820 map. Elements are linked into e820_table, which the
 * insertion code keeps sorted by ascending address.
 */
struct e820_element {
	TAILQ_ENTRY(e820_element) chain;	/* linkage in e820_table */
	uint64_t base;				/* first address of the range */
	uint64_t end;		/* end of the range; length is end - base */
	enum e820_memory_type type;		/* kind of memory in the range */
};
/* The E820 map itself; starts out empty. */
static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER(
    e820_table);
56 
57 static struct e820_element *
58 e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type)
59 {
60 	struct e820_element *element;
61 
62 	element = calloc(1, sizeof(*element));
63 	if (element == NULL) {
64 		return (NULL);
65 	}
66 
67 	element->base = base;
68 	element->end = end;
69 	element->type = type;
70 
71 	return (element);
72 }
73 
74 static const char *
75 e820_get_type_name(const enum e820_memory_type type)
76 {
77 	switch (type) {
78 	case E820_TYPE_MEMORY:
79 		return ("RAM");
80 	case E820_TYPE_RESERVED:
81 		return ("Reserved");
82 	case E820_TYPE_ACPI:
83 		return ("ACPI");
84 	case E820_TYPE_NVS:
85 		return ("NVS");
86 	default:
87 		return ("Unknown");
88 	}
89 }
90 
91 void
92 e820_dump_table(void)
93 {
94 	struct e820_element *element;
95 	uint64_t i;
96 
97 	EPRINTLN("E820 map:");
98 
99 	i = 0;
100 	TAILQ_FOREACH(element, &e820_table, chain) {
101 		EPRINTLN("  (%4lu) [%16lx, %16lx] %s", i,
102 		    element->base, element->end,
103 		    e820_get_type_name(element->type));
104 
105 		++i;
106 	}
107 }
108 
109 static struct qemu_fwcfg_item *
110 e820_get_fwcfg_item(void)
111 {
112 	struct qemu_fwcfg_item *fwcfg_item;
113 	struct e820_element *element;
114 	struct e820_entry *entries;
115 	int count, i;
116 
117 	count = 0;
118 	TAILQ_FOREACH(element, &e820_table, chain) {
119 		++count;
120 	}
121 	if (count == 0) {
122 		warnx("%s: E820 table empty", __func__);
123 		return (NULL);
124 	}
125 
126 	fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item));
127 	if (fwcfg_item == NULL) {
128 		return (NULL);
129 	}
130 
131 	fwcfg_item->size = count * sizeof(struct e820_entry);
132 	fwcfg_item->data = calloc(count, sizeof(struct e820_entry));
133 	if (fwcfg_item->data == NULL) {
134 		free(fwcfg_item);
135 		return (NULL);
136 	}
137 
138 	i = 0;
139 	entries = (struct e820_entry *)fwcfg_item->data;
140 	TAILQ_FOREACH(element, &e820_table, chain) {
141 		struct e820_entry *entry = &entries[i];
142 
143 		entry->base = element->base;
144 		entry->length = element->end - element->base;
145 		entry->type = element->type;
146 
147 		++i;
148 	}
149 
150 	return (fwcfg_item);
151 }
152 
153 static int
154 e820_add_entry(const uint64_t base, const uint64_t end,
155     const enum e820_memory_type type)
156 {
157 	struct e820_element *new_element;
158 	struct e820_element *element;
159 	struct e820_element *ram_element;
160 
161 	assert(end >= base);
162 
163 	new_element = e820_element_alloc(base, end, type);
164 	if (new_element == NULL) {
165 		return (ENOMEM);
166 	}
167 
168 	/*
169 	 * E820 table should always be sorted in ascending order. Therefore,
170 	 * search for a range whose end is larger than the base parameter.
171 	 */
172 	TAILQ_FOREACH(element, &e820_table, chain) {
173 		if (element->end > base) {
174 			break;
175 		}
176 	}
177 
178 	/*
179 	 * System memory requires special handling.
180 	 */
181 	if (type == E820_TYPE_MEMORY) {
182 		/*
183 		 * base is larger than of any existing element. Add new system
184 		 * memory at the end of the table.
185 		 */
186 		if (element == NULL) {
187 			TAILQ_INSERT_TAIL(&e820_table, new_element, chain);
188 			return (0);
189 		}
190 
191 		/*
192 		 * System memory shouldn't overlap with any existing element.
193 		 */
194 		assert(end >= element->base);
195 
196 		TAILQ_INSERT_BEFORE(element, new_element, chain);
197 
198 		return (0);
199 	}
200 
201 	assert(element != NULL);
202 	/* Non system memory should be allocated inside system memory. */
203 	assert(element->type == E820_TYPE_MEMORY);
204 	/* New element should fit into existing system memory element. */
205 	assert(base >= element->base && end <= element->end);
206 	if (base == element->base && end == element->end) {
207 		/*
208 		 * The new entry replaces an existing one.
209 		 *
210 		 * Old table:
211 		 *      [ 0x1000, 0x4000] RAM           <-- element
212 		 * New table:
213 		 *      [ 0x1000, 0x4000] Reserved
214 		 */
215 		TAILQ_INSERT_BEFORE(element, new_element, chain);
216 		TAILQ_REMOVE(&e820_table, element, chain);
217 		free(element);
218 	} else if (base == element->base) {
219 		/*
220 		 * New element at system memory base boundary. Add new
221 		 * element before current and adjust the base of the old
222 		 * element.
223 		 *
224 		 * Old table:
225 		 * 	[ 0x1000, 0x4000] RAM		<-- element
226 		 * New table:
227 		 * 	[ 0x1000, 0x2000] Reserved
228 		 * 	[ 0x2000, 0x4000] RAM		<-- element
229 		 */
230 		TAILQ_INSERT_BEFORE(element, new_element, chain);
231 		element->base = end;
232 	} else if (end == element->end) {
233 		/*
234 		 * New element at system memory end boundary. Add new
235 		 * element after current and adjust the end of the
236 		 * current element.
237 		 *
238 		 * Old table:
239 		 * 	[ 0x1000, 0x4000] RAM		<-- element
240 		 * New table:
241 		 * 	[ 0x1000, 0x3000] RAM		<-- element
242 		 * 	[ 0x3000, 0x4000] Reserved
243 		 */
244 		TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain);
245 		element->end = base;
246 	} else {
247 		/*
248 		 * New element inside system memory entry. Split it by
249 		 * adding a system memory element and the new element
250 		 * before current.
251 		 *
252 		 * Old table:
253 		 * 	[ 0x1000, 0x4000] RAM		<-- element
254 		 * New table:
255 		 * 	[ 0x1000, 0x2000] RAM
256 		 * 	[ 0x2000, 0x3000] Reserved
257 		 * 	[ 0x3000, 0x4000] RAM		<-- element
258 		 */
259 		ram_element = e820_element_alloc(element->base, base,
260 		    E820_TYPE_MEMORY);
261 		if (ram_element == NULL) {
262 			return (ENOMEM);
263 		}
264 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
265 		TAILQ_INSERT_BEFORE(element, new_element, chain);
266 		element->base = end;
267 	}
268 
269 	return (0);
270 }
271 
272 static int
273 e820_add_memory_hole(const uint64_t base, const uint64_t end)
274 {
275 	struct e820_element *element;
276 	struct e820_element *ram_element;
277 
278 	assert(end >= base);
279 
280 	/*
281 	 * E820 table should be always sorted in ascending order. Therefore,
282 	 * search for an element which end is larger than the base parameter.
283 	 */
284 	TAILQ_FOREACH(element, &e820_table, chain) {
285 		if (element->end > base) {
286 			break;
287 		}
288 	}
289 
290 	if (element == NULL || end <= element->base) {
291 		/* Nothing to do. Hole already exists */
292 		return (0);
293 	}
294 
295 	/* Memory holes are only allowed in system memory */
296 	assert(element->type == E820_TYPE_MEMORY);
297 
298 	if (base == element->base) {
299 		/*
300 		 * New hole at system memory base boundary.
301 		 *
302 		 * Old table:
303 		 * 	[ 0x1000, 0x4000] RAM
304 		 * New table:
305 		 * 	[ 0x2000, 0x4000] RAM
306 		 */
307 		element->base = end;
308 	} else if (end == element->end) {
309 		/*
310 		 * New hole at system memory end boundary.
311 		 *
312 		 * Old table:
313 		 * 	[ 0x1000, 0x4000] RAM
314 		 * New table:
315 		 * 	[ 0x1000, 0x3000] RAM
316 		 */
317 		element->end = base;
318 	} else {
319 		/*
320 		 * New hole inside system memory entry. Split the system memory.
321 		 *
322 		 * Old table:
323 		 * 	[ 0x1000, 0x4000] RAM		<-- element
324 		 * New table:
325 		 * 	[ 0x1000, 0x2000] RAM
326 		 * 	[ 0x3000, 0x4000] RAM		<-- element
327 		 */
328 		ram_element = e820_element_alloc(element->base, base,
329 		    E820_TYPE_MEMORY);
330 		if (ram_element == NULL) {
331 			return (ENOMEM);
332 		}
333 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
334 		element->base = end;
335 	}
336 
337 	return (0);
338 }
339 
340 static uint64_t
341 e820_alloc_highest(const uint64_t max_address, const uint64_t length,
342     const uint64_t alignment, const enum e820_memory_type type)
343 {
344 	struct e820_element *element;
345 
346 	TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) {
347 		uint64_t address, base, end;
348 
349 		end = MIN(max_address, element->end);
350 		base = roundup2(element->base, alignment);
351 
352 		/*
353 		 * If end - length == 0, we would allocate memory at address 0. This
354 		 * address is mostly unusable and we should avoid allocating it.
355 		 * Therefore, search for another block in that case.
356 		 */
357 		if (element->type != E820_TYPE_MEMORY || end < base ||
358 		    end - base < length || end - length == 0) {
359 			continue;
360 		}
361 
362 		address = rounddown2(end - length, alignment);
363 
364 		if (e820_add_entry(address, address + length, type) != 0) {
365 			return (0);
366 		}
367 
368 		return (address);
369 	}
370 
371 	return (0);
372 }
373 
374 static uint64_t
375 e820_alloc_lowest(const uint64_t min_address, const uint64_t length,
376     const uint64_t alignment, const enum e820_memory_type type)
377 {
378 	struct e820_element *element;
379 
380 	TAILQ_FOREACH(element, &e820_table, chain) {
381 		uint64_t base, end;
382 
383 		end = element->end;
384 		base = MAX(min_address, roundup2(element->base, alignment));
385 
386 		/*
387 		 * If base == 0, we would allocate memory at address 0. This
388 		 * address is mostly unusable and we should avoid allocating it.
389 		 * Therefore, search for another block in that case.
390 		 */
391 		if (element->type != E820_TYPE_MEMORY || end < base ||
392 		    end - base < length || base == 0) {
393 			continue;
394 		}
395 
396 		if (e820_add_entry(base, base + length, type) != 0) {
397 			return (0);
398 		}
399 
400 		return (base);
401 	}
402 
403 	return (0);
404 }
405 
406 uint64_t
407 e820_alloc(const uint64_t address, const uint64_t length,
408     const uint64_t alignment, const enum e820_memory_type type,
409     const enum e820_allocation_strategy strategy)
410 {
411 	assert(powerof2(alignment));
412 	assert((address & (alignment - 1)) == 0);
413 
414 	switch (strategy) {
415 	case E820_ALLOCATE_ANY:
416 		/*
417 		 * Allocate any address. Therefore, ignore the address parameter
418 		 * and reuse the code path for allocating the lowest address.
419 		 */
420 		return (e820_alloc_lowest(0, length, alignment, type));
421 	case E820_ALLOCATE_LOWEST:
422 		return (e820_alloc_lowest(address, length, alignment, type));
423 	case E820_ALLOCATE_HIGHEST:
424 		return (e820_alloc_highest(address, length, alignment, type));
425 	case E820_ALLOCATE_SPECIFIC:
426 		if (e820_add_entry(address, address + length, type) != 0) {
427 			return (0);
428 		}
429 
430 		return (address);
431 	}
432 
433 	return (0);
434 }
435 
436 int
437 e820_init(struct vmctx *const ctx)
438 {
439 	uint64_t lowmem_size, highmem_size;
440 	int error;
441 
442 	TAILQ_INIT(&e820_table);
443 
444 	lowmem_size = vm_get_lowmem_size(ctx);
445 	error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY);
446 	if (error) {
447 		warnx("%s: Could not add lowmem", __func__);
448 		return (error);
449 	}
450 
451 	highmem_size = vm_get_highmem_size(ctx);
452 	if (highmem_size != 0) {
453 		error = e820_add_entry(4 * GB, 4 * GB + highmem_size,
454 		    E820_TYPE_MEMORY);
455 		if (error) {
456 			warnx("%s: Could not add highmem", __func__);
457 			return (error);
458 		}
459 	}
460 
461 	error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END);
462 	if (error) {
463 		warnx("%s: Could not add VGA memory", __func__);
464 		return (error);
465 	}
466 
467 	error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END);
468 	if (error) {
469 		warnx("%s: Could not add ROM area", __func__);
470 		return (error);
471 	}
472 
473 	return (0);
474 }
475 
476 int
477 e820_finalize(void)
478 {
479 	struct qemu_fwcfg_item *e820_fwcfg_item;
480 	int error;
481 
482 	e820_fwcfg_item = e820_get_fwcfg_item();
483 	if (e820_fwcfg_item == NULL) {
484 		warnx("invalid e820 table");
485 		return (ENOMEM);
486 	}
487 	error = qemu_fwcfg_add_file("etc/e820",
488 	    e820_fwcfg_item->size, e820_fwcfg_item->data);
489 	if (error != 0) {
490 		warnx("could not add qemu fwcfg etc/e820");
491 		free(e820_fwcfg_item->data);
492 		free(e820_fwcfg_item);
493 		return (error);
494 	}
495 	free(e820_fwcfg_item);
496 
497 	return (0);
498 }
499