xref: /linux/arch/x86/boot/compressed/kaslr.c (revision e58e871becec2d3b04ed91c0c16fe8deac9c9dfa)
/*
 * kaslr.c
 *
 * This contains the routines needed to generate a reasonable level of
 * entropy to choose a randomized kernel base address offset in support
 * of Kernel Address Space Layout Randomization (KASLR). Additionally
 * handles walking the physical memory maps (and tracking memory regions
 * to avoid) in order to select a physical memory location that can
 * contain the entire properly aligned running kernel image.
 *
 */
#include "misc.h"
#include "error.h"
#include "../boot.h"

#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <generated/utsrelease.h>

/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;

static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	size_t i;
	unsigned long *ptr = (unsigned long *)area;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
		hash ^= ptr[i];
	}

	return hash;
}

/* Attempt to create a simple but unpredictable starting entropy. */
static unsigned long get_boot_seed(void)
{
	unsigned long hash = 0;

	hash = rotate_xor(hash, build_str, sizeof(build_str));
	hash = rotate_xor(hash, boot_params, sizeof(*boot_params));

	return hash;
}

#define KASLR_COMPRESSED_BOOT
#include "../../lib/kaslr.c"

struct mem_vector {
	unsigned long long start;
	unsigned long long size;
};

/* Only supporting at most 4 unusable memmap regions with kaslr */
#define MAX_MEMMAP_REGIONS	4

static bool memmap_too_large;

enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,
	MEM_AVOID_INITRD,
	MEM_AVOID_CMDLINE,
	MEM_AVOID_BOOTPARAMS,
	MEM_AVOID_MEMMAP_BEGIN,
	MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
	MEM_AVOID_MAX,
};

static struct mem_vector mem_avoid[MEM_AVOID_MAX];

static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
	/* Item one is entirely before item two. */
	if (one->start + one->size <= two->start)
		return false;
	/* Item one is entirely after item two. */
	if (one->start >= two->start + two->size)
		return false;
	return true;
}

/**
 *	_memparse - Parse a string with mem suffixes into a number
 *	@ptr: Where parse begins
 *	@retptr: (output) Optional pointer to next char after parse completes
 *
 *	Parses a string into a number.  The number stored at @ptr is
 *	potentially suffixed with K, M, G, T, P, E.
 */
static unsigned long long _memparse(const char *ptr, char **retptr)
{
	char *endptr;	/* Local pointer to end of parsed string */

	unsigned long long ret = simple_strtoull(ptr, &endptr, 0);

	switch (*endptr) {
	case 'E':
	case 'e':
		ret <<= 10;
	case 'P':
	case 'p':
		ret <<= 10;
	case 'T':
	case 't':
		ret <<= 10;
	case 'G':
	case 'g':
		ret <<= 10;
	case 'M':
	case 'm':
		ret <<= 10;
	case 'K':
	case 'k':
		ret <<= 10;
		endptr++;
	default:
		break;
	}

	if (retptr)
		*retptr = endptr;

	return ret;
}
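
/*
 * For illustration: the switch above intentionally falls through, so each
 * suffix also applies every smaller suffix's shift. A few worked examples
 * (hypothetical inputs, not taken from a real command line):
 *
 *	_memparse("32K", NULL)  == 32  << 10                == 0x8000
 *	_memparse("512M", NULL) == 512 << (10 + 10)         == 0x20000000
 *	_memparse("1G", NULL)   == 1   << (10 + 10 + 10)    == 0x40000000
 *
 * Only the 'K'/'k' case advances endptr, but every larger suffix falls
 * through to it, so *retptr always ends up just past the suffix character.
 */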

static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
{
	char *oldp;

	if (!p)
		return -EINVAL;

	/* We don't care about this option here */
	if (!strncmp(p, "exactmap", 8))
		return -EINVAL;

	oldp = p;
	*size = _memparse(p, &p);
	if (p == oldp)
		return -EINVAL;

	switch (*p) {
	case '@':
		/* Skip this region, usable */
		*start = 0;
		*size = 0;
		return 0;
	case '#':
	case '$':
	case '!':
		*start = _memparse(p + 1, &p);
		return 0;
	}

	return -EINVAL;
}
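
/*
 * For illustration, with a hypothetical command-line fragment
 * "memmap=64M$0x10000000" (reserve 64M starting at 256M):
 *
 *	parse_memmap("64M$0x10000000", &start, &size)
 *
 * returns 0 with size == 0x4000000 and start == 0x10000000, so that range
 * is later recorded in mem_avoid[]. A "memmap=nn[KMG]@ss[KMG]" entry
 * describes usable RAM instead, so it is reported back with
 * start == size == 0 and never added to the avoid list.
 */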

static void mem_avoid_memmap(void)
{
	char arg[128];
	int rc;
	int i;
	char *str;

	/* See if we have any memmap areas */
	rc = cmdline_find_option("memmap", arg, sizeof(arg));
	if (rc <= 0)
		return;

	i = 0;
	str = arg;
	while (str && (i < MAX_MEMMAP_REGIONS)) {
		int rc;
		unsigned long long start, size;
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		rc = parse_memmap(str, &start, &size);
		if (rc < 0)
			break;
		str = k;
		/* A usable region ('@') need not be avoided. */
		if (size == 0)
			continue;

		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
		i++;
	}

	/* More memmap= regions than we can track: physical KASLR must be skipped. */
	if ((i >= MAX_MEMMAP_REGIONS) && str)
		memmap_too_large = true;
}

/*
 * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
 * The mem_avoid array is used to store the ranges that need to be avoided
 * when KASLR searches for an appropriate random address. We must avoid any
 * regions that are unsafe to overlap with during decompression, and other
 * things like the initrd, cmdline and boot_params. This comment seeks to
 * explain mem_avoid as clearly as possible since incorrect mem_avoid
 * memory ranges lead to really hard to debug boot failures.
 *
 * The initrd, cmdline, and boot_params are trivial to identify for
 * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
 * MEM_AVOID_BOOTPARAMS respectively below.
 *
 * What is less obvious is how to avoid the range of memory that is used
 * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
 * the compressed kernel (ZO) and its run space, which is used to extract
 * the uncompressed kernel (VO) and relocs.
 *
 * ZO's full run size sits against the end of the decompression buffer, so
 * we can calculate where text, data, bss, etc of ZO are positioned more
 * easily.
 *
 * For additional background, the decompression calculations can be found
 * in header.S, and the memory diagram is based on the one found in misc.c.
 *
 * The following conditions are already enforced by the image layouts and
 * associated code:
 *  - input + input_size >= output + output_size
 *  - kernel_total_size <= init_size
 *  - kernel_total_size <= output_size (see Note below)
 *  - output + init_size >= output + output_size
 *
 * (Note that kernel_total_size and output_size have no fundamental
 * relationship, but output_size is passed to choose_random_location
 * as a maximum of the two. The diagram shows a case where
 * kernel_total_size is larger than output_size, but this case is
 * handled by bumping output_size.)
 *
 * The above conditions can be illustrated by a diagram:
 *
 * 0   output            input            input+input_size    output+init_size
 * |     |                 |                             |             |
 * |     |                 |                             |             |
 * |-----|--------|--------|--------------|-----------|--|-------------|
 *                |                       |           |
 *                |                       |           |
 * output+init_size-ZO_INIT_SIZE  output+output_size  output+kernel_total_size
 *
 * [output, output+init_size) is the entire memory range used for
 * extracting the compressed image.
 *
 * [output, output+kernel_total_size) is the range needed for the
 * uncompressed kernel (VO) and its run size (bss, brk, etc).
 *
 * [output, output+output_size) is VO plus relocs (i.e. the entire
 * uncompressed payload contained by ZO). This is the area of the buffer
 * written to during decompression.
 *
 * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
 * range of the copied ZO and decompression code. (i.e. the range
 * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
 *
 * [input, input+input_size) is the original copied compressed image (ZO)
 * (i.e. it does not include its run size). This range must be avoided
 * because it contains the data used for decompression.
 *
 * [input+input_size, output+init_size) is [_text, _end) for ZO. This
 * range includes ZO's heap and stack, and must be avoided since it
 * performs the decompression.
 *
 * Since the above two ranges need to be avoided and they are adjacent,
 * they can be merged, resulting in: [input, output+init_size) which
 * becomes the MEM_AVOID_ZO_RANGE below.
 */
static void mem_avoid_init(unsigned long input, unsigned long input_size,
			   unsigned long output)
{
	unsigned long init_size = boot_params->hdr.init_size;
	u64 initrd_start, initrd_size;
	u64 cmd_line, cmd_line_size;
	char *ptr;

	/*
	 * Avoid the region that is unsafe to overlap during
	 * decompression.
	 */
	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
			 mem_avoid[MEM_AVOID_ZO_RANGE].size);

	/* Avoid initrd. */
	initrd_start  = (u64)boot_params->ext_ramdisk_image << 32;
	initrd_start |= boot_params->hdr.ramdisk_image;
	initrd_size  = (u64)boot_params->ext_ramdisk_size << 32;
	initrd_size |= boot_params->hdr.ramdisk_size;
	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
	/* No need to set mapping for initrd, it will be handled in VO. */

	/* Avoid kernel command line. */
	cmd_line  = (u64)boot_params->ext_cmd_line_ptr << 32;
	cmd_line |= boot_params->hdr.cmd_line_ptr;
	/* Calculate size of cmd_line. */
	ptr = (char *)(unsigned long)cmd_line;
	for (cmd_line_size = 0; ptr[cmd_line_size++]; )
		;
	mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
	mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
	add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
			 mem_avoid[MEM_AVOID_CMDLINE].size);

	/* Avoid boot parameters. */
	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);

	/* We don't need to set a mapping for setup_data. */

	/* Mark the memmap regions we need to avoid */
	mem_avoid_memmap();

#ifdef CONFIG_X86_VERBOSE_BOOTUP
	/* Make sure video RAM can be used. */
	add_identity_map(0, PMD_SIZE);
#endif
}
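
/*
 * For illustration, with purely hypothetical values: suppose output is at
 * 16M, init_size is 64M, and the bootloader copied ZO to input = 72M with
 * input_size = 6M. Then mem_avoid[MEM_AVOID_ZO_RANGE] covers
 * [72M, 16M + 64M) = [72M, 80M), i.e. (output + init_size) - input = 8M,
 * which excludes both the copied compressed image and ZO's run space at
 * the end of the decompression buffer, exactly the merged range derived
 * in the comment above.
 */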

/*
 * Does this memory vector overlap a known avoided area? If so, record the
 * overlap region with the lowest address.
 */
static bool mem_avoid_overlap(struct mem_vector *img,
			      struct mem_vector *overlap)
{
	int i;
	struct setup_data *ptr;
	unsigned long earliest = img->start + img->size;
	bool is_overlapping = false;

	for (i = 0; i < MEM_AVOID_MAX; i++) {
		if (mem_overlaps(img, &mem_avoid[i]) &&
		    mem_avoid[i].start < earliest) {
			*overlap = mem_avoid[i];
			earliest = overlap->start;
			is_overlapping = true;
		}
	}

	/* Avoid all entries in the setup_data linked list. */
	ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
	while (ptr) {
		struct mem_vector avoid;

		avoid.start = (unsigned long)ptr;
		avoid.size = sizeof(*ptr) + ptr->len;

		if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
			*overlap = avoid;
			earliest = overlap->start;
			is_overlapping = true;
		}

		ptr = (struct setup_data *)(unsigned long)ptr->next;
	}

	return is_overlapping;
}

struct slot_area {
	unsigned long addr;
	int num;
};

#define MAX_SLOT_AREA 100

static struct slot_area slot_areas[MAX_SLOT_AREA];

static unsigned long slot_max;

static unsigned long slot_area_index;

static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
	struct slot_area slot_area;

	if (slot_area_index == MAX_SLOT_AREA)
		return;

	slot_area.addr = region->start;
	slot_area.num = (region->size - image_size) /
			CONFIG_PHYSICAL_ALIGN + 1;

	if (slot_area.num > 0) {
		slot_areas[slot_area_index++] = slot_area;
		slot_max += slot_area.num;
	}
}
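
/*
 * For illustration, with hypothetical values: a 64M region whose start is
 * already aligned, an image_size of 16M and a CONFIG_PHYSICAL_ALIGN of 2M
 * yield
 *
 *	num = (64M - 16M) / 2M + 1 = 25
 *
 * candidate slots: the image can start at any of the 25 2M-aligned offsets
 * 0M, 2M, ..., 48M into the region and still fit entirely inside it.
 */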

static unsigned long slots_fetch_random(void)
{
	unsigned long slot;
	int i;

	/* Handle case of no slots stored. */
	if (slot_max == 0)
		return 0;

	slot = kaslr_get_random_long("Physical") % slot_max;

	for (i = 0; i < slot_area_index; i++) {
		if (slot >= slot_areas[i].num) {
			slot -= slot_areas[i].num;
			continue;
		}
		return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
	}

	if (i == slot_area_index)
		debug_putstr("slots_fetch_random() failed!?\n");
	return 0;
}
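
/*
 * For illustration, with hypothetical values: two stored areas, the first
 * with addr = 0x4000000 and num = 10, the second with addr = 0x20000000
 * and num = 25, give slot_max = 35. A random slot of 12 skips the first
 * area (12 >= 10, leaving 2) and lands in the second, so the function
 * returns 0x20000000 + 2 * CONFIG_PHYSICAL_ALIGN.
 */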

static void process_e820_entry(struct boot_e820_entry *entry,
			       unsigned long minimum,
			       unsigned long image_size)
{
	struct mem_vector region, overlap;
	struct slot_area slot_area;
	unsigned long start_orig;

	/* Skip non-RAM entries. */
	if (entry->type != E820_TYPE_RAM)
		return;

	/* On 32-bit, ignore entries entirely above our maximum. */
	if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
		return;

	/* Ignore entries entirely below our minimum. */
	if (entry->addr + entry->size < minimum)
		return;

	region.start = entry->addr;
	region.size = entry->size;

	/* Give up if slot area array is full. */
	while (slot_area_index < MAX_SLOT_AREA) {
		start_orig = region.start;

		/* Potentially raise address to minimum location. */
		if (region.start < minimum)
			region.start = minimum;

		/* Potentially raise address to meet alignment needs. */
		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);

		/* Did we raise the address above this e820 region? */
		if (region.start > entry->addr + entry->size)
			return;

		/* Reduce size by any delta from the original address. */
		region.size -= region.start - start_orig;

		/* On 32-bit, reduce region size to fit within max size. */
		if (IS_ENABLED(CONFIG_X86_32) &&
		    region.start + region.size > KERNEL_IMAGE_SIZE)
			region.size = KERNEL_IMAGE_SIZE - region.start;

		/* Return if region can't contain decompressed kernel */
		if (region.size < image_size)
			return;

		/* If nothing overlaps, store the region and return. */
		if (!mem_avoid_overlap(&region, &overlap)) {
			store_slot_info(&region, image_size);
			return;
		}

		/* Store beginning of region if holds at least image_size. */
		if (overlap.start > region.start + image_size) {
			struct mem_vector beginning;

			beginning.start = region.start;
			beginning.size = overlap.start - region.start;
			store_slot_info(&beginning, image_size);
		}

		/* Return if overlap extends to or past end of region. */
		if (overlap.start + overlap.size >= region.start + region.size)
			return;

		/* Clip off the overlapping region and start over. */
		region.size -= overlap.start - region.start + overlap.size;
		region.start = overlap.start + overlap.size;
	}
}
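
/*
 * For illustration, with hypothetical values: an aligned RAM entry covering
 * [64M, 192M) with image_size = 16M, where mem_avoid_overlap() reports an
 * initrd occupying [96M, 128M), is handled in two passes of the loop above.
 * The first pass stores the [64M, 96M) head as a slot area (32M >= 16M),
 * clips the region down to [128M, 192M) and loops; the second pass finds
 * no remaining overlap and stores [128M, 192M) as well.
 */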

static unsigned long find_random_phys_addr(unsigned long minimum,
					   unsigned long image_size)
{
	int i;
	unsigned long addr;

	/* Check if we had too many memmaps. */
	if (memmap_too_large) {
		debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
		return 0;
	}

	/* Make sure minimum is aligned. */
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);

	/* Verify potential e820 positions, appending to slots list. */
	for (i = 0; i < boot_params->e820_entries; i++) {
		process_e820_entry(&boot_params->e820_table[i], minimum,
				   image_size);
		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
			break;
		}
	}

	return slots_fetch_random();
}

static unsigned long find_random_virt_addr(unsigned long minimum,
					   unsigned long image_size)
{
	unsigned long slots, random_addr;

	/* Make sure minimum is aligned. */
	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
	/* Align image_size for easy slot calculations. */
	image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);

	/*
	 * How many CONFIG_PHYSICAL_ALIGN-sized slots that can hold
	 * image_size fit within the range [minimum, KERNEL_IMAGE_SIZE)?
	 */
	slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
		 CONFIG_PHYSICAL_ALIGN + 1;

	random_addr = kaslr_get_random_long("Virtual") % slots;

	return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
}
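
/*
 * For illustration, with hypothetical configuration values: a
 * KERNEL_IMAGE_SIZE of 1G, minimum = 16M, an aligned image_size of 64M and
 * CONFIG_PHYSICAL_ALIGN of 2M give
 *
 *	slots = (1024M - 16M - 64M) / 2M + 1 = 473
 *
 * so the randomized virtual offset is one of the 473 values
 * 16M, 18M, ..., 960M, and the last slot still leaves 64M of room below
 * KERNEL_IMAGE_SIZE.
 */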

/*
 * Since this function operates on addresses numerically rather than
 * dereferencing them, it takes the input and output pointers as
 * 'unsigned long'.
 */
void choose_random_location(unsigned long input,
			    unsigned long input_size,
			    unsigned long *output,
			    unsigned long output_size,
			    unsigned long *virt_addr)
{
	unsigned long random_addr, min_addr;

	/* By default, keep output position unchanged. */
	*virt_addr = *output;

	if (cmdline_find_option_bool("nokaslr")) {
		warn("KASLR disabled: 'nokaslr' on cmdline.");
		return;
	}

	boot_params->hdr.loadflags |= KASLR_FLAG;

	/* Prepare to add new identity pagetables on demand. */
	initialize_identity_maps();

	/* Record the various known unsafe memory ranges. */
	mem_avoid_init(input, input_size, *output);

	/*
	 * Low end of the randomization range should be the
	 * smaller of 512M or the initial kernel image
	 * location:
	 */
	min_addr = min(*output, 512UL << 20);

	/* Walk e820 and find a random address. */
	random_addr = find_random_phys_addr(min_addr, output_size);
	if (!random_addr) {
		warn("Physical KASLR disabled: no suitable memory region!");
	} else {
		/* Update the new physical address location. */
		if (*output != random_addr) {
			add_identity_map(random_addr, output_size);
			*output = random_addr;
		}

		/*
		 * This loads the identity mapping page table.
		 * This should only be done if a new physical address
		 * is found for the kernel; otherwise we keep the old
		 * page table so that behaviour matches the "nokaslr"
		 * case.
		 */
		finalize_identity_maps();
	}

	/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
	if (IS_ENABLED(CONFIG_X86_64))
		random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
	*virt_addr = random_addr;
}
617