// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2002 Richard Henderson
 * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 * Copyright (C) 2024 Mike Rapoport IBM.
 */

#define pr_fmt(fmt) "execmem: " fmt

#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/execmem.h>
#include <linux/maple_tree.h>
#include <linux/set_memory.h>
#include <linux/moduleloader.h>
#include <linux/text-patching.h>

#include <asm/tlbflush.h>

#include "internal.h"

static struct execmem_info *execmem_info __ro_after_init;
static struct execmem_info default_execmem_info __ro_after_init;

#ifdef CONFIG_MMU
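/*
 * Allocate @size bytes from @range with __vmalloc_node_range(), retrying
 * in the fallback range when the primary range is exhausted. When the
 * range requests a KASAN shadow, allocate that as well.
 */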
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
{
	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
	unsigned int align = range->alignment;
	unsigned long start = range->start;
	unsigned long end = range->end;
	void *p;

	if (kasan)
		vm_flags |= VM_DEFER_KMEMLEAK;

	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		align = PMD_SIZE;

	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
				 pgprot, vm_flags, NUMA_NO_NODE,
				 __builtin_return_address(0));
	if (!p && range->fallback_start) {
		start = range->fallback_start;
		end = range->fallback_end;
		p = __vmalloc_node_range(size, align, start, end, gfp_flags,
					 pgprot, vm_flags, NUMA_NO_NODE,
					 __builtin_return_address(0));
	}

	if (!p) {
		pr_warn_ratelimited("unable to allocate memory\n");
		return NULL;
	}

	if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
		vfree(p);
		return NULL;
	}

	return p;
}

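/*
 * Reserve an unpopulated vm area inside the EXECMEM_MODULE_DATA range,
 * falling back to the fallback range when the primary one is full.
 */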
struct vm_struct *execmem_vmap(size_t size)
{
	struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
	struct vm_struct *area;

	area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
				  range->start, range->end, NUMA_NO_NODE,
				  GFP_KERNEL, __builtin_return_address(0));
	if (!area && range->fallback_start)
		area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
					  range->fallback_start, range->fallback_end,
					  NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));

	return area;
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
{
	return vmalloc(size);
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
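/*
 * The ROX cache keeps PMD-sized chunks that are mapped with the range's
 * (read-only, executable) protections and removed from the direct map.
 * Live allocations are tracked in @busy_areas, space available for reuse
 * in @free_areas; both maple trees are protected by @mutex.
 */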
struct execmem_cache {
	struct mutex mutex;
	struct maple_tree busy_areas;
	struct maple_tree free_areas;
};

static struct execmem_cache execmem_cache = {
	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
				     execmem_cache.mutex),
	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
				     execmem_cache.mutex),
};

static inline unsigned long mas_range_len(struct ma_state *mas)
{
	return mas->last - mas->index + 1;
}

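/*
 * Mark the direct map entries for all pages backing @vm as valid or
 * invalid. On failure, roll back the pages that were already updated.
 */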
static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
{
	unsigned int nr = (1 << get_vm_area_page_order(vm));
	unsigned int updated = 0;
	int err = 0;

	for (int i = 0; i < vm->nr_pages; i += nr) {
		err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
		if (err)
			goto err_restore;
		updated += nr;
	}

	return 0;

err_restore:
	for (int i = 0; i < updated; i += nr)
		set_direct_map_valid_noflush(vm->pages[i], nr, !valid);

	return err;
}

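/*
 * Deferred work that returns fully free, PMD-aligned cached areas to
 * vmalloc after restoring their direct map entries.
 */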
static void execmem_cache_clean(struct work_struct *work)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	MA_STATE(mas, free_areas, 0, ULONG_MAX);
	void *area;

	mutex_lock(mutex);
	mas_for_each(&mas, area, ULONG_MAX) {
		size_t size = mas_range_len(&mas);

		if (IS_ALIGNED(size, PMD_SIZE) &&
		    IS_ALIGNED(mas.index, PMD_SIZE)) {
			struct vm_struct *vm = find_vm_area(area);

			execmem_set_direct_map_valid(vm, true);
			mas_store_gfp(&mas, NULL, GFP_KERNEL);
			vfree(area);
		}
	}
	mutex_unlock(mutex);
}

static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);

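/*
 * Add [ptr, ptr + size) to free_areas, merging it with the adjacent
 * free areas when they are contiguous.
 */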
static int execmem_cache_add(void *ptr, size_t size)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, free_areas, addr - 1, addr + 1);
	unsigned long lower, upper;
	void *area = NULL;
	int err;

	lower = addr;
	upper = addr + size - 1;

	mutex_lock(mutex);
	area = mas_walk(&mas);
	if (area && mas.last == addr - 1)
		lower = mas.index;

	area = mas_next(&mas, ULONG_MAX);
	if (area && mas.index == addr + size)
		upper = mas.last;

	mas_set_range(&mas, lower, upper);
	err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
	mutex_unlock(mutex);
	if (err)
		return err;

	return 0;
}

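/*
 * Check whether [mas->index, mas->index + size) fits entirely inside
 * @range or its fallback range.
 */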
static bool within_range(struct execmem_range *range, struct ma_state *mas,
			 size_t size)
{
	unsigned long addr = mas->index;

	if (addr >= range->start && addr + size < range->end)
		return true;

	if (range->fallback_start &&
	    addr >= range->fallback_start && addr + size < range->fallback_end)
		return true;

	return false;
}

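/*
 * Find a free area that can hold @size bytes and lies within @range,
 * move [addr, addr + size) to busy_areas and return any remainder of
 * the free area to free_areas.
 */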
static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr, last, area_size = 0;
	void *area, *ptr = NULL;
	int err;

	mutex_lock(mutex);
	mas_for_each(&mas_free, area, ULONG_MAX) {
		area_size = mas_range_len(&mas_free);

		if (area_size >= size && within_range(range, &mas_free, size))
			break;
	}

	if (area_size < size)
		goto out_unlock;

	addr = mas_free.index;
	last = mas_free.last;

	/* insert allocated size to busy_areas at range [addr, addr + size) */
	mas_set_range(&mas_busy, addr, addr + size - 1);
	err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
	if (err)
		goto out_unlock;

	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
	if (area_size > size) {
		void *ptr = (void *)(addr + size);

		/*
		 * re-insert remaining free size to free_areas at range
		 * [addr + size, last]
		 */
		mas_set_range(&mas_free, addr + size, last);
		err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
		if (err) {
			mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
			goto out_unlock;
		}
	}
	ptr = (void *)addr;

out_unlock:
	mutex_unlock(mutex);
	return ptr;
}

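/*
 * Grow the cache: allocate a PMD-aligned chunk, fill it with trapping
 * instructions while it is still writable, invalidate its direct map
 * entries, remap it with the range's protections as large pages and
 * hand it to free_areas.
 */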
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
	unsigned long start, end;
	struct vm_struct *vm;
	size_t alloc_size;
	int err = -ENOMEM;
	void *p;

	alloc_size = round_up(size, PMD_SIZE);
	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
	if (!p)
		return err;

	vm = find_vm_area(p);
	if (!vm)
		goto err_free_mem;

	/* fill memory with instructions that will trap */
	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);

	start = (unsigned long)p;
	end = start + alloc_size;

	vunmap_range(start, end);

	err = execmem_set_direct_map_valid(vm, false);
	if (err)
		goto err_free_mem;

	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
				       PMD_SHIFT);
	if (err)
		goto err_free_mem;

	err = execmem_cache_add(p, alloc_size);
	if (err)
		goto err_free_mem;

	return 0;

err_free_mem:
	vfree(p);
	return err;
}

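/*
 * Allocate from the cache; when nothing suitable is found, populate the
 * cache and retry once.
 */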
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	void *p;
	int err;

	p = __execmem_cache_alloc(range, size);
	if (p)
		return p;

	err = execmem_cache_populate(range, size);
	if (err)
		return NULL;

	return __execmem_cache_alloc(range, size);
}

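/*
 * Return a cached allocation to free_areas: drop it from busy_areas,
 * overwrite it with trapping instructions and schedule the cleanup
 * worker. Returns false when @ptr is not managed by the cache.
 */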
static bool execmem_cache_free(void *ptr)
{
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, busy_areas, addr, addr);
	size_t size;
	void *area;

	mutex_lock(mutex);
	area = mas_walk(&mas);
	if (!area) {
		mutex_unlock(mutex);
		return false;
	}
	size = mas_range_len(&mas);

	mas_store_gfp(&mas, NULL, GFP_KERNEL);
	mutex_unlock(mutex);

	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);

	execmem_cache_add(ptr, size);

	schedule_work(&execmem_cache_clean_work);

	return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	return NULL;
}

static bool execmem_cache_free(void *ptr)
{
	return false;
}
#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */

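/*
 * Allocate executable memory of @type: from the ROX cache when the
 * range has EXECMEM_ROX_CACHE set, otherwise straight from vmalloc
 * with the range's protections.
 */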
void *execmem_alloc(enum execmem_type type, size_t size)
{
	struct execmem_range *range = &execmem_info->ranges[type];
	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
	pgprot_t pgprot = range->pgprot;
	void *p;

	if (use_cache)
		p = execmem_cache_alloc(range, size);
	else
		p = execmem_vmalloc(range, size, pgprot, vm_flags);

	return kasan_reset_tag(p);
}

void execmem_free(void *ptr)
{
	/*
	 * This memory may be RO, and freeing RO memory in an interrupt is not
	 * supported by vmalloc.
	 */
	WARN_ON(in_interrupt());

	if (!execmem_cache_free(ptr))
		vfree(ptr);
}

void *execmem_update_copy(void *dst, const void *src, size_t size)
{
	return text_poke_copy(dst, src, size);
}

bool execmem_is_rox(enum execmem_type type)
{
	return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
}

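/*
 * Sanity-check the architecture-provided ranges and strip the
 * EXECMEM_ROX_CACHE flag when the architecture does not support it.
 */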
static bool execmem_validate(struct execmem_info *info)
{
	struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT];

	if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) {
		pr_crit("Invalid parameters for execmem allocator, module loading will fail");
		return false;
	}

	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
		for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
			r = &info->ranges[i];

			if (r->flags & EXECMEM_ROX_CACHE) {
				pr_warn_once("ROX cache is not supported\n");
				r->flags &= ~EXECMEM_ROX_CACHE;
			}
		}
	}

	return true;
}

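/*
 * Fill in the ranges the architecture left unset with the parameters of
 * EXECMEM_DEFAULT; EXECMEM_MODULE_DATA additionally gets PAGE_KERNEL
 * protections.
 */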
static void execmem_init_missing(struct execmem_info *info)
{
	struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT];

	for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) {
		struct execmem_range *r = &info->ranges[i];

		if (!r->start) {
			if (i == EXECMEM_MODULE_DATA)
				r->pgprot = PAGE_KERNEL;
			else
				r->pgprot = default_range->pgprot;
			r->alignment = default_range->alignment;
			r->start = default_range->start;
			r->end = default_range->end;
			r->flags = default_range->flags;
			r->fallback_start = default_range->fallback_start;
			r->fallback_end = default_range->fallback_end;
		}
	}
}

struct execmem_info * __weak execmem_arch_setup(void)
{
	return NULL;
}

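/*
 * Set up execmem_info from the architecture hook, falling back to a
 * single executable range covering the whole vmalloc area when the
 * architecture does not provide one.
 */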
static void __init __execmem_init(void)
{
	struct execmem_info *info = execmem_arch_setup();

	if (!info) {
		info = execmem_info = &default_execmem_info;
		info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START;
		info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END;
		info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC;
		info->ranges[EXECMEM_DEFAULT].alignment = 1;
	}

	if (!execmem_validate(info))
		return;

	execmem_init_missing(info);

	execmem_info = info;
}

#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE
static int __init execmem_late_init(void)
{
	__execmem_init();
	return 0;
}
core_initcall(execmem_late_init);
#else
void __init execmem_init(void)
{
	__execmem_init();
}
#endif