xref: /illumos-gate/usr/src/cmd/bhyve/amd64/e820.c (revision 6e66f8aaeedfaa2a4e1a2f8999dee4586a229518)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG
5  * Author: Corvin Köhne <c.koehne@beckhoff.com>
6  */
7 
8 #include <sys/types.h>
9 #include <sys/queue.h>
10 
11 #include <machine/vmm.h>
12 
13 #include <assert.h>
14 #include <err.h>
15 #include <errno.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 
20 #include "debug.h"
21 #include "e820.h"
22 #include "qemu_fwcfg.h"
23 
24 /*
25  * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it
26  * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't
27  * hold all possible physical addresses and we can get into trouble.
28  */
29 static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t),
30     "Unable to represent physical memory by E820 table");
31 
/* Name of the fwcfg file that carries the E820 table. */
#define	E820_FWCFG_FILE_NAME	"etc/e820"

/* Size units used when computing guest physical addresses. */
#define	KB	(1024UL)
#define	MB	(1024 * KB)
#define	GB	(1024 * MB)

/*
 * Fixed E820 memory holes:
 * [    A0000,    C0000) VGA
 * [    C0000,   100000) ROM
 */
#define	E820_VGA_MEM_BASE	0xA0000
#define	E820_VGA_MEM_END	0xC0000
#define	E820_ROM_MEM_BASE	0xC0000
#define	E820_ROM_MEM_END	0x100000
47 
48 struct e820_element {
49 	TAILQ_ENTRY(e820_element) chain;
50 	uint64_t base;
51 	uint64_t end;
52 	enum e820_memory_type type;
53 };
54 static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER(
55     e820_table);
56 
57 static struct e820_element *
e820_element_alloc(uint64_t base,uint64_t end,enum e820_memory_type type)58 e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type)
59 {
60 	struct e820_element *element;
61 
62 	element = calloc(1, sizeof(*element));
63 	if (element == NULL) {
64 		return (NULL);
65 	}
66 
67 	element->base = base;
68 	element->end = end;
69 	element->type = type;
70 
71 	return (element);
72 }
73 
74 static const char *
e820_get_type_name(const enum e820_memory_type type)75 e820_get_type_name(const enum e820_memory_type type)
76 {
77 	switch (type) {
78 	case E820_TYPE_MEMORY:
79 		return ("RAM");
80 	case E820_TYPE_RESERVED:
81 		return ("Reserved");
82 	case E820_TYPE_ACPI:
83 		return ("ACPI");
84 	case E820_TYPE_NVS:
85 		return ("NVS");
86 	default:
87 		return ("Unknown");
88 	}
89 }
90 
91 void
e820_dump_table(void)92 e820_dump_table(void)
93 {
94 	struct e820_element *element;
95 	uint64_t i;
96 
97 	EPRINTLN("E820 map:");
98 
99 	i = 0;
100 	TAILQ_FOREACH(element, &e820_table, chain) {
101 		EPRINTLN("  (%4lu) [%16lx, %16lx] %s", i,
102 		    element->base, element->end,
103 		    e820_get_type_name(element->type));
104 
105 		++i;
106 	}
107 }
108 
109 static struct qemu_fwcfg_item *
e820_get_fwcfg_item(void)110 e820_get_fwcfg_item(void)
111 {
112 	struct qemu_fwcfg_item *fwcfg_item;
113 	struct e820_element *element;
114 	struct e820_entry *entries;
115 	int count, i;
116 
117 	count = 0;
118 	TAILQ_FOREACH(element, &e820_table, chain) {
119 		++count;
120 	}
121 	if (count == 0) {
122 		warnx("%s: E820 table empty", __func__);
123 		return (NULL);
124 	}
125 
126 	fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item));
127 	if (fwcfg_item == NULL) {
128 		return (NULL);
129 	}
130 
131 	fwcfg_item->size = count * sizeof(struct e820_entry);
132 	fwcfg_item->data = calloc(count, sizeof(struct e820_entry));
133 	if (fwcfg_item->data == NULL) {
134 		free(fwcfg_item);
135 		return (NULL);
136 	}
137 
138 	i = 0;
139 	entries = (struct e820_entry *)fwcfg_item->data;
140 	TAILQ_FOREACH(element, &e820_table, chain) {
141 		struct e820_entry *entry = &entries[i];
142 
143 		entry->base = element->base;
144 		entry->length = element->end - element->base;
145 		entry->type = element->type;
146 
147 		++i;
148 	}
149 
150 	return (fwcfg_item);
151 }
152 
153 static int
e820_add_entry(const uint64_t base,const uint64_t end,const enum e820_memory_type type)154 e820_add_entry(const uint64_t base, const uint64_t end,
155     const enum e820_memory_type type)
156 {
157 	struct e820_element *new_element;
158 	struct e820_element *element;
159 	struct e820_element *sib_element;
160 	struct e820_element *ram_element;
161 
162 	assert(end >= base);
163 
164 	new_element = e820_element_alloc(base, end, type);
165 	if (new_element == NULL) {
166 		return (ENOMEM);
167 	}
168 
169 	/*
170 	 * E820 table should always be sorted in ascending order. Therefore,
171 	 * search for a range whose end is larger than the base parameter.
172 	 */
173 	TAILQ_FOREACH(element, &e820_table, chain) {
174 		if (element->end > base) {
175 			break;
176 		}
177 	}
178 
179 	/*
180 	 * System memory requires special handling.
181 	 */
182 	if (type == E820_TYPE_MEMORY) {
183 		/*
184 		 * base is larger than of any existing element. Add new system
185 		 * memory at the end of the table.
186 		 */
187 		if (element == NULL) {
188 			TAILQ_INSERT_TAIL(&e820_table, new_element, chain);
189 			return (0);
190 		}
191 
192 		/*
193 		 * System memory shouldn't overlap with any existing element.
194 		 */
195 		assert(end >= element->base);
196 
197 		TAILQ_INSERT_BEFORE(element, new_element, chain);
198 
199 		return (0);
200 	}
201 
202 	assert(element != NULL);
203 	/* Non system memory should be allocated inside system memory. */
204 	assert(element->type == E820_TYPE_MEMORY);
205 	/* New element should fit into existing system memory element. */
206 	assert(base >= element->base && end <= element->end);
207 	if (base == element->base && end == element->end) {
208 		/*
209 		 * The new entry replaces an existing one.
210 		 *
211 		 * Old table:
212 		 *      [ 0x1000, 0x4000] RAM           <-- element
213 		 * New table:
214 		 *      [ 0x1000, 0x4000] Reserved
215 		 */
216 		TAILQ_INSERT_BEFORE(element, new_element, chain);
217 		TAILQ_REMOVE(&e820_table, element, chain);
218 		free(element);
219 	} else if (base == element->base) {
220 		/*
221 		 * New element at system memory base boundary. Add new
222 		 * element before current and adjust the base of the old
223 		 * element.
224 		 *
225 		 * Old table:
226 		 * 	[ 0x1000, 0x4000] RAM		<-- element
227 		 * New table:
228 		 * 	[ 0x1000, 0x2000] Reserved
229 		 * 	[ 0x2000, 0x4000] RAM		<-- element
230 		 */
231 		TAILQ_INSERT_BEFORE(element, new_element, chain);
232 		element->base = end;
233 	} else if (end == element->end) {
234 		/*
235 		 * New element at system memory end boundary. Add new
236 		 * element after current and adjust the end of the
237 		 * current element.
238 		 *
239 		 * Old table:
240 		 * 	[ 0x1000, 0x4000] RAM		<-- element
241 		 * New table:
242 		 * 	[ 0x1000, 0x3000] RAM		<-- element
243 		 * 	[ 0x3000, 0x4000] Reserved
244 		 */
245 		TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain);
246 		element->end = base;
247 	} else {
248 		/*
249 		 * New element inside system memory entry. Split it by
250 		 * adding a system memory element and the new element
251 		 * before current.
252 		 *
253 		 * Old table:
254 		 * 	[ 0x1000, 0x4000] RAM		<-- element
255 		 * New table:
256 		 * 	[ 0x1000, 0x2000] RAM
257 		 * 	[ 0x2000, 0x3000] Reserved
258 		 * 	[ 0x3000, 0x4000] RAM		<-- element
259 		 */
260 		ram_element = e820_element_alloc(element->base, base,
261 		    E820_TYPE_MEMORY);
262 		if (ram_element == NULL) {
263 			return (ENOMEM);
264 		}
265 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
266 		TAILQ_INSERT_BEFORE(element, new_element, chain);
267 		element->base = end;
268 	}
269 
270 	/*
271 	 * If the previous element has the same type and ends at our base
272 	 * boundary, we can merge both entries.
273 	 */
274 	sib_element = TAILQ_PREV(new_element, e820_table, chain);
275 	if (sib_element != NULL &&
276 	    sib_element->type == new_element->type &&
277 	    sib_element->end == new_element->base) {
278 		new_element->base = sib_element->base;
279 		TAILQ_REMOVE(&e820_table, sib_element, chain);
280 		free(sib_element);
281 	}
282 
283 	/*
284 	 * If the next element has the same type and starts at our end
285 	 * boundary, we can merge both entries.
286 	 */
287 	sib_element = TAILQ_NEXT(new_element, chain);
288 	if (sib_element != NULL &&
289 	    sib_element->type == new_element->type &&
290 	    sib_element->base == new_element->end) {
291 		/* Merge new element into subsequent one. */
292 		new_element->end = sib_element->end;
293 		TAILQ_REMOVE(&e820_table, sib_element, chain);
294 		free(sib_element);
295 	}
296 
297 	return (0);
298 }
299 
300 static int
e820_add_memory_hole(const uint64_t base,const uint64_t end)301 e820_add_memory_hole(const uint64_t base, const uint64_t end)
302 {
303 	struct e820_element *element;
304 	struct e820_element *ram_element;
305 
306 	assert(end >= base);
307 
308 	/*
309 	 * E820 table should be always sorted in ascending order. Therefore,
310 	 * search for an element which end is larger than the base parameter.
311 	 */
312 	TAILQ_FOREACH(element, &e820_table, chain) {
313 		if (element->end > base) {
314 			break;
315 		}
316 	}
317 
318 	if (element == NULL || end <= element->base) {
319 		/* Nothing to do. Hole already exists */
320 		return (0);
321 	}
322 
323 	/* Memory holes are only allowed in system memory */
324 	assert(element->type == E820_TYPE_MEMORY);
325 
326 	if (base == element->base) {
327 		/*
328 		 * New hole at system memory base boundary.
329 		 *
330 		 * Old table:
331 		 * 	[ 0x1000, 0x4000] RAM
332 		 * New table:
333 		 * 	[ 0x2000, 0x4000] RAM
334 		 */
335 		element->base = end;
336 	} else if (end == element->end) {
337 		/*
338 		 * New hole at system memory end boundary.
339 		 *
340 		 * Old table:
341 		 * 	[ 0x1000, 0x4000] RAM
342 		 * New table:
343 		 * 	[ 0x1000, 0x3000] RAM
344 		 */
345 		element->end = base;
346 	} else {
347 		/*
348 		 * New hole inside system memory entry. Split the system memory.
349 		 *
350 		 * Old table:
351 		 * 	[ 0x1000, 0x4000] RAM		<-- element
352 		 * New table:
353 		 * 	[ 0x1000, 0x2000] RAM
354 		 * 	[ 0x3000, 0x4000] RAM		<-- element
355 		 */
356 		ram_element = e820_element_alloc(element->base, base,
357 		    E820_TYPE_MEMORY);
358 		if (ram_element == NULL) {
359 			return (ENOMEM);
360 		}
361 		TAILQ_INSERT_BEFORE(element, ram_element, chain);
362 		element->base = end;
363 	}
364 
365 	return (0);
366 }
367 
368 static uint64_t
e820_alloc_highest(const uint64_t max_address,const uint64_t length,const uint64_t alignment,const enum e820_memory_type type)369 e820_alloc_highest(const uint64_t max_address, const uint64_t length,
370     const uint64_t alignment, const enum e820_memory_type type)
371 {
372 	struct e820_element *element;
373 
374 	TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) {
375 		uint64_t address, base, end;
376 
377 		end = MIN(max_address, element->end);
378 		base = roundup2(element->base, alignment);
379 
380 		/*
381 		 * If end - length == 0, we would allocate memory at address 0. This
382 		 * address is mostly unusable and we should avoid allocating it.
383 		 * Therefore, search for another block in that case.
384 		 */
385 		if (element->type != E820_TYPE_MEMORY || end < base ||
386 		    end - base < length || end - length == 0) {
387 			continue;
388 		}
389 
390 		address = rounddown2(end - length, alignment);
391 
392 		if (e820_add_entry(address, address + length, type) != 0) {
393 			return (0);
394 		}
395 
396 		return (address);
397 	}
398 
399 	return (0);
400 }
401 
402 static uint64_t
e820_alloc_lowest(const uint64_t min_address,const uint64_t length,const uint64_t alignment,const enum e820_memory_type type)403 e820_alloc_lowest(const uint64_t min_address, const uint64_t length,
404     const uint64_t alignment, const enum e820_memory_type type)
405 {
406 	struct e820_element *element;
407 
408 	TAILQ_FOREACH(element, &e820_table, chain) {
409 		uint64_t base, end;
410 
411 		end = element->end;
412 		base = MAX(min_address, roundup2(element->base, alignment));
413 
414 		/*
415 		 * If base == 0, we would allocate memory at address 0. This
416 		 * address is mostly unusable and we should avoid allocating it.
417 		 * Therefore, search for another block in that case.
418 		 */
419 		if (element->type != E820_TYPE_MEMORY || end < base ||
420 		    end - base < length || base == 0) {
421 			continue;
422 		}
423 
424 		if (e820_add_entry(base, base + length, type) != 0) {
425 			return (0);
426 		}
427 
428 		return (base);
429 	}
430 
431 	return (0);
432 }
433 
434 uint64_t
e820_alloc(const uint64_t address,const uint64_t length,const uint64_t alignment,const enum e820_memory_type type,const enum e820_allocation_strategy strategy)435 e820_alloc(const uint64_t address, const uint64_t length,
436     const uint64_t alignment, const enum e820_memory_type type,
437     const enum e820_allocation_strategy strategy)
438 {
439 	assert(powerof2(alignment));
440 	assert((address & (alignment - 1)) == 0);
441 
442 	switch (strategy) {
443 	case E820_ALLOCATE_ANY:
444 		/*
445 		 * Allocate any address. Therefore, ignore the address parameter
446 		 * and reuse the code path for allocating the lowest address.
447 		 */
448 		return (e820_alloc_lowest(0, length, alignment, type));
449 	case E820_ALLOCATE_LOWEST:
450 		return (e820_alloc_lowest(address, length, alignment, type));
451 	case E820_ALLOCATE_HIGHEST:
452 		return (e820_alloc_highest(address, length, alignment, type));
453 	case E820_ALLOCATE_SPECIFIC:
454 		if (e820_add_entry(address, address + length, type) != 0) {
455 			return (0);
456 		}
457 
458 		return (address);
459 	}
460 
461 	return (0);
462 }
463 
464 int
e820_init(struct vmctx * const ctx)465 e820_init(struct vmctx *const ctx)
466 {
467 	uint64_t lowmem_size, highmem_size;
468 	int error;
469 
470 	TAILQ_INIT(&e820_table);
471 
472 	lowmem_size = vm_get_lowmem_size(ctx);
473 	error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY);
474 	if (error) {
475 		warnx("%s: Could not add lowmem", __func__);
476 		return (error);
477 	}
478 
479 	highmem_size = vm_get_highmem_size(ctx);
480 	if (highmem_size != 0) {
481 		error = e820_add_entry(4 * GB, 4 * GB + highmem_size,
482 		    E820_TYPE_MEMORY);
483 		if (error) {
484 			warnx("%s: Could not add highmem", __func__);
485 			return (error);
486 		}
487 	}
488 
489 	error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END);
490 	if (error) {
491 		warnx("%s: Could not add VGA memory", __func__);
492 		return (error);
493 	}
494 
495 	error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END);
496 	if (error) {
497 		warnx("%s: Could not add ROM area", __func__);
498 		return (error);
499 	}
500 
501 	return (0);
502 }
503 
504 int
e820_finalize(void)505 e820_finalize(void)
506 {
507 	struct qemu_fwcfg_item *e820_fwcfg_item;
508 	int error;
509 
510 	e820_fwcfg_item = e820_get_fwcfg_item();
511 	if (e820_fwcfg_item == NULL) {
512 		warnx("invalid e820 table");
513 		return (ENOMEM);
514 	}
515 	error = qemu_fwcfg_add_file("etc/e820",
516 	    e820_fwcfg_item->size, e820_fwcfg_item->data);
517 	if (error != 0) {
518 		warnx("could not add qemu fwcfg etc/e820");
519 		free(e820_fwcfg_item->data);
520 		free(e820_fwcfg_item);
521 		return (error);
522 	}
523 	free(e820_fwcfg_item);
524 
525 	return (0);
526 }
527