// SPDX-License-Identifier: GPL-2.0
/*
 * KFENCE guarded object allocator and fault handling.
 *
 * Copyright (C) 2020, Google LLC.
 */

#define pr_fmt(fmt) "kfence: " fmt

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/hash.h>
#include <linux/irq_work.h>
#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/nodemask.h>
#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/reboot.h>
#include <linux/sched/clock.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

#include <asm/kfence.h>

#include "kfence.h"

/* Disables KFENCE on the first warning assuming an irrecoverable error. */
#define KFENCE_WARN_ON(cond)                                                   \
	({                                                                     \
		const bool __cond = WARN_ON(cond);                             \
		if (unlikely(__cond)) {                                        \
			WRITE_ONCE(kfence_enabled, false);                     \
			disabled_by_warn = true;                               \
		}                                                              \
		__cond;                                                        \
	})

/* === Data ================================================================= */

static bool kfence_enabled __read_mostly;
static bool disabled_by_warn __read_mostly;

unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "kfence."

static int kfence_enable_late(void);
static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
{
	unsigned long num;
	int ret = kstrtoul(val, 0, &num);

	if (ret < 0)
		return ret;

	/* Using 0 to indicate KFENCE is disabled. */
	if (!num && READ_ONCE(kfence_enabled)) {
		pr_info("disabled\n");
		WRITE_ONCE(kfence_enabled, false);
	}

	*((unsigned long *)kp->arg) = num;

	if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
		return disabled_by_warn ? -EINVAL : kfence_enable_late();
	return 0;
}

static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
{
	if (!READ_ONCE(kfence_enabled))
		return sprintf(buffer, "0\n");

	return param_get_ulong(buffer, kp);
}

static const struct kernel_param_ops sample_interval_param_ops = {
	.set = param_set_sample_interval,
	.get = param_get_sample_interval,
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
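/*
 * Example usage (assuming the standard sysfs location for built-in module
 * parameters): boot with "kfence.sample_interval=100" on the kernel command
 * line, or write the interval in milliseconds to
 * /sys/module/kfence/parameters/sample_interval at runtime; writing 0
 * disables KFENCE.
 */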

/* Pool usage% threshold when currently covered allocations are skipped. */
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);

/* Allocation burst count: number of excess KFENCE allocations per sample. */
static unsigned int kfence_burst __read_mostly;
module_param_named(burst, kfence_burst, uint, 0644);

/* If true, use a deferrable timer. */
static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
module_param_named(deferrable, kfence_deferrable, bool, 0444);

/* If true, check all canary bytes on panic. */
static bool kfence_check_on_panic __read_mostly;
module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);

/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __read_mostly;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */

/*
 * Per-object metadata, with one-to-one mapping of object metadata to
 * backing pages (in __kfence_pool).
 */
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
struct kfence_metadata *kfence_metadata __read_mostly;

/*
 * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache().
 * So introduce kfence_metadata_init to initialize metadata, and then make
 * kfence_metadata visible after initialization is successful. This prevents
 * potential UAF or access to uninitialized metadata.
 */
static struct kfence_metadata *kfence_metadata_init __read_mostly;

/* Freelist with available objects. */
DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
static struct list_head kfence_freelist __guarded_by(&kfence_freelist_lock) = LIST_HEAD_INIT(kfence_freelist);

/*
 * The static key to set up a KFENCE allocation; or if static keys are not used
 * to gate allocations, to avoid a load and compare if KFENCE is disabled.
 */
DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);

/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);

/*
 * A Counting Bloom filter of allocation coverage: limits currently covered
 * allocations of the same source filling up the pool.
 *
 * Assuming a range of 15%-85% unique allocations in the pool at any point in
 * time, the below parameters provide a probability of 0.02-0.33 for false
 * positive hits respectively:
 *
 *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
 */
#define ALLOC_COVERED_HNUM	2
#define ALLOC_COVERED_ORDER	(const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
#define ALLOC_COVERED_SIZE	(1 << ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_HNEXT(h)	hash_32(h, ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
static atomic_t alloc_covered[ALLOC_COVERED_SIZE];

/* Stack depth used to determine uniqueness of an allocation. */
#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)

/*
 * Randomness for stack hashes, making the same collisions across reboots and
 * different machines less likely.
 */
static u32 stack_hash_seed __ro_after_init;

/* Statistics counters for debugfs. */
enum kfence_counter_id {
	KFENCE_COUNTER_ALLOCATED,
	KFENCE_COUNTER_ALLOCS,
	KFENCE_COUNTER_FREES,
	KFENCE_COUNTER_ZOMBIES,
	KFENCE_COUNTER_BUGS,
	KFENCE_COUNTER_SKIP_INCOMPAT,
	KFENCE_COUNTER_SKIP_CAPACITY,
	KFENCE_COUNTER_SKIP_COVERED,
	KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
static const char *const counter_names[] = {
	[KFENCE_COUNTER_ALLOCATED]	= "currently allocated",
	[KFENCE_COUNTER_ALLOCS]		= "total allocations",
	[KFENCE_COUNTER_FREES]		= "total frees",
	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
	[KFENCE_COUNTER_BUGS]		= "total bugs",
	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);

/* === Internals ============================================================ */

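/* Return true if currently allocated objects exceed the skip_covered_thresh fraction of the pool. */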
static inline bool should_skip_covered(void)
{
	unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;

	return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
}

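/* Hash the allocation's stack trace for use with the Counting Bloom filter. */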
static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
{
	num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
	num_entries = filter_irq_stacks(stack_entries, num_entries);
	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
}

/*
 * Adds (or subtracts) count @val for allocation stack trace hash
 * @alloc_stack_hash from Counting Bloom filter.
 */
static void alloc_covered_add(u32 alloc_stack_hash, int val)
{
	int i;

	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
		atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
	}
}

/*
 * Returns true if the allocation stack trace hash @alloc_stack_hash is
 * currently contained (non-zero count) in Counting Bloom filter.
 */
static bool alloc_covered_contains(u32 alloc_stack_hash)
{
	int i;

	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
		if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
			return false;
		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
	}

	return true;
}

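/* Protect the page containing @addr; on failure, KFENCE_WARN_ON() disables KFENCE. */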
static bool kfence_protect(unsigned long addr)
{
	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
}

static bool kfence_unprotect(unsigned long addr)
{
	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
}

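/* Return the start address of the object page backing @meta. */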
static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
	__must_hold(&meta->lock)
{
	unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
	unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];

	/* The checks do not affect performance; only called from slow-paths. */

	/* Only call with a pointer into kfence_metadata. */
	if (KFENCE_WARN_ON(meta < kfence_metadata ||
			   meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
		return 0;

	/*
	 * This metadata object only ever maps to 1 page; verify that the stored
	 * address is in the expected range.
	 */
	if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
		return 0;

	return pageaddr;
}

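/* Return true if the object is allocated, including objects whose RCU-deferred free is still pending. */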
static inline bool kfence_obj_allocated(const struct kfence_metadata *meta)
{
	enum kfence_object_state state = READ_ONCE(meta->state);

	return state == KFENCE_OBJECT_ALLOCATED || state == KFENCE_OBJECT_RCU_FREEING;
}

/*
 * Update the object's metadata state, including updating the alloc/free stacks
 * depending on the state transition.
 */
static noinline void
metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
		      unsigned long *stack_entries, size_t num_stack_entries)
	__must_hold(&meta->lock)
{
	struct kfence_track *track =
		next == KFENCE_OBJECT_ALLOCATED ? &meta->alloc_track : &meta->free_track;

	lockdep_assert_held(&meta->lock);

	/* The stack was already saved when initiating the RCU free; skip. */
	if (READ_ONCE(meta->state) == KFENCE_OBJECT_RCU_FREEING)
		goto out;

	if (stack_entries) {
		memcpy(track->stack_entries, stack_entries,
		       num_stack_entries * sizeof(stack_entries[0]));
	} else {
		/*
		 * Skip over 1 (this) function; noinline ensures we do not
		 * accidentally skip over the caller by never inlining.
		 */
		num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
	}
	track->num_stack_entries = num_stack_entries;
	track->pid = task_pid_nr(current);
	track->cpu = raw_smp_processor_id();
	track->ts_nsec = local_clock(); /* Same source as printk timestamps. */

out:
	/*
	 * Pairs with READ_ONCE() in
	 * kfence_shutdown_cache(),
	 * kfence_handle_page_fault().
	 */
	WRITE_ONCE(meta->state, next);
}

#ifdef CONFIG_KMSAN
#define check_canary_attributes noinline __no_kmsan_checks
#else
#define check_canary_attributes inline
#endif

/* Check canary byte at @addr. */
static check_canary_attributes bool check_canary_byte(u8 *addr)
{
	struct kfence_metadata *meta;
	unsigned long flags;

	if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr)))
		return true;

	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

	meta = addr_to_metadata((unsigned long)addr);
	raw_spin_lock_irqsave(&meta->lock, flags);
	kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
	raw_spin_unlock_irqrestore(&meta->lock, flags);

	return false;
}

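/* Write the canary pattern to the redzone bytes surrounding the object. */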
static inline void set_canary(const struct kfence_metadata *meta)
{
	const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
	unsigned long addr = pageaddr;

	/*
	 * The canary pattern may be written over part of the object memory;
	 * this is harmless, as the user is expected to initialize the object
	 * before using it.
	 */
	for (; addr < meta->addr; addr += sizeof(u64))
		*((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;

	addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64));
	for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64))
		*((u64 *)addr) = KFENCE_CANARY_PATTERN_U64;
}

static check_canary_attributes void
check_canary(const struct kfence_metadata *meta)
{
	const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
	unsigned long addr = pageaddr;

	/*
	 * We'll iterate over each canary byte per-side until a corrupted byte
	 * is found. However, we'll still iterate over the canary bytes to the
	 * right of the object even if there was an error in the canary bytes to
	 * the left of the object. Specifically, if check_canary_byte()
	 * generates an error, showing both sides might give more clues as to
	 * what the error is about when displaying which bytes were corrupted.
	 */

	/* Apply to left of object. */
	for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) {
		if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
			break;
	}

	/*
	 * If a corrupted 64-bit word was found above, or the remaining canary
	 * bytes before the object do not fill a whole 64-bit word, check the
	 * remainder byte by byte.
	 */
	for (; addr < meta->addr; addr++) {
		if (unlikely(!check_canary_byte((u8 *)addr)))
			break;
	}

	/* Apply to right of object. */
	for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) {
		if (unlikely(!check_canary_byte((u8 *)addr)))
			return;
	}
	for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) {
		if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) {

			for (; addr - pageaddr < PAGE_SIZE; addr++) {
				if (!check_canary_byte((u8 *)addr))
					return;
			}
		}
	}
}

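/*
 * Allocate a guarded object from the KFENCE pool: take an object off the
 * freelist, place it at a random end of its page, set up canaries and slab
 * metadata, and return its address (or NULL if no object is available).
 */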
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
				  unsigned long *stack_entries, size_t num_stack_entries,
				  u32 alloc_stack_hash)
{
	struct kfence_metadata *meta = NULL;
	unsigned long flags;
	struct slab *slab;
	void *addr;
	const bool random_right_allocate = get_random_u32_below(2);
	const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
				  !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS);

	/* Try to obtain a free object. */
	raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
	if (!list_empty(&kfence_freelist)) {
		meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
		list_del_init(&meta->list);
	}
	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
	if (!meta) {
		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
		return NULL;
	}

	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
		/*
		 * This is extremely unlikely -- we are reporting on a
		 * use-after-free, which locked meta->lock, and the reporting
		 * code via printk calls kmalloc() which ends up in
		 * kfence_alloc() and tries to grab the same object that we're
		 * reporting on. While it has never been observed, lockdep does
		 * report that there is a possibility of deadlock. Fix it by
		 * using trylock and bailing out gracefully.
		 */
		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
		/* Put the object back on the freelist. */
		list_add_tail(&meta->list, &kfence_freelist);
		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);

		return NULL;
	}

	meta->addr = metadata_to_pageaddr(meta);
	/* Unprotect if we're reusing this page. */
	if (meta->state == KFENCE_OBJECT_FREED)
		kfence_unprotect(meta->addr);

	/*
	 * Note: for allocations made before RNG initialization,
	 * get_random_u32_below() will always return zero. We still benefit
	 * from enabling KFENCE as early as possible, even when the RNG is not
	 * yet available, as this will allow KFENCE to detect bugs due to
	 * earlier allocations. The only downside is that the out-of-bounds
	 * accesses detected are deterministic for such allocations.
	 */
	if (random_right_allocate) {
		/* Allocate on the "right" side, re-calculate address. */
		meta->addr += PAGE_SIZE - size;
		meta->addr = ALIGN_DOWN(meta->addr, cache->align);
	}

	addr = (void *)meta->addr;

	/* Update remaining metadata. */
	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
	WRITE_ONCE(meta->cache, cache);
	meta->size = size;
	meta->alloc_stack_hash = alloc_stack_hash;
	raw_spin_unlock_irqrestore(&meta->lock, flags);

	alloc_covered_add(alloc_stack_hash, 1);

	/* Set required slab fields. */
	slab = virt_to_slab(addr);
	slab->slab_cache = cache;
	slab->objects = 1;

	/* Memory initialization. */
	set_canary(meta);

	/*
	 * We check slab_want_init_on_alloc() ourselves, rather than letting
	 * SL*B do the initialization, as otherwise we might overwrite KFENCE's
	 * redzone.
	 */
	if (unlikely(slab_want_init_on_alloc(gfp, cache)))
		memzero_explicit(addr, size);
	if (cache->ctor)
		cache->ctor(addr);

	if (random_fault)
		kfence_protect(meta->addr); /* Random "faults" by protecting the object. */

	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);

	return addr;
}

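/*
 * Free a guarded object: validate the free, check the canary bytes, protect
 * the page to catch use-after-frees, and return the object to the freelist
 * (unless it is a zombie of a destroyed cache).
 */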
static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
{
	struct kcsan_scoped_access assert_page_exclusive;
	u32 alloc_stack_hash;
	unsigned long flags;
	bool init;

	raw_spin_lock_irqsave(&meta->lock, flags);

	if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) {
		/* Invalid or double-free, bail out. */
		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
		kfence_report_error((unsigned long)addr, false, NULL, meta,
				    KFENCE_ERROR_INVALID_FREE);
		raw_spin_unlock_irqrestore(&meta->lock, flags);
		return;
	}

	/* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
	kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
				  KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
				  &assert_page_exclusive);

	if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
		kfence_unprotect((unsigned long)addr); /* To check canary bytes. */

	/* Restore page protection if there was an OOB access. */
	if (meta->unprotected_page) {
		memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
		kfence_protect(meta->unprotected_page);
		meta->unprotected_page = 0;
	}

	/* Mark the object as freed. */
	metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
	init = slab_want_init_on_free(meta->cache);
	alloc_stack_hash = meta->alloc_stack_hash;
	raw_spin_unlock_irqrestore(&meta->lock, flags);

	alloc_covered_add(alloc_stack_hash, -1);

	/* Check canary bytes for memory corruption. */
	check_canary(meta);

	/*
	 * Clear memory if init-on-free is set. While we protect the page, the
	 * data is still there, and after a use-after-free is detected, we
	 * unprotect the page, so the data is still accessible.
	 */
	if (!zombie && unlikely(init))
		memzero_explicit(addr, meta->size);

	/* Protect to detect use-after-frees. */
	kfence_protect((unsigned long)addr);

	kcsan_end_scoped_access(&assert_page_exclusive);
	if (!zombie) {
		/* Add it to the tail of the freelist for reuse. */
		raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
		KFENCE_WARN_ON(!list_empty(&meta->list));
		list_add_tail(&meta->list, &kfence_freelist);
		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);

		atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
		atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
	} else {
		/* See kfence_shutdown_cache(). */
		atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
	}
}

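/* RCU callback that performs the deferred free of a SLAB_TYPESAFE_BY_RCU object. */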
static void rcu_guarded_free(struct rcu_head *h)
{
	struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);

	kfence_guarded_free((void *)meta->addr, meta, false);
}

/*
 * Initialization of the KFENCE pool after its allocation.
 * Returns 0 on success; otherwise returns the address up to
 * which partial initialization succeeded.
 */
static unsigned long kfence_init_pool(void)
	__context_unsafe(/* constructor */)
{
	unsigned long addr, start_pfn;
	int i, rand;

	if (!arch_kfence_init_pool())
		return (unsigned long)__kfence_pool;

	addr = (unsigned long)__kfence_pool;
	start_pfn = PHYS_PFN(virt_to_phys(__kfence_pool));

	/*
	 * Set up object pages: they must have PGTY_slab set to avoid freeing
	 * them as real pages.
	 *
	 * We also want to avoid inserting kfence_free() in the kfree()
	 * fast-path in SLUB, and therefore need to ensure kfree() correctly
	 * enters __slab_free() slow-path.
	 */
	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
		struct page *page;

		if (!i || (i % 2))
			continue;

		page = pfn_to_page(start_pfn + i);
		__SetPageSlab(page);
#ifdef CONFIG_MEMCG
		struct slab *slab = page_slab(page);
		slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
				 MEMCG_DATA_OBJEXTS;
#endif
	}

	/*
	 * Protect the first 2 pages. The first page is mostly unnecessary, and
	 * merely serves as an extended guard page. However, adding one
	 * additional page in the beginning gives us an even number of pages,
	 * which simplifies the mapping of address to metadata index.
	 */
	for (i = 0; i < 2; i++) {
		if (unlikely(!kfence_protect(addr)))
			return addr;

		addr += PAGE_SIZE;
	}

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
		struct kfence_metadata *meta = &kfence_metadata_init[i];

		/* Initialize metadata. */
		INIT_LIST_HEAD(&meta->list);
		raw_spin_lock_init(&meta->lock);
		meta->state = KFENCE_OBJECT_UNUSED;
		/* Use addr to randomize the freelist. */
		meta->addr = i;

		/* Protect the right redzone. */
		if (unlikely(!kfence_protect(addr + 2 * i * PAGE_SIZE + PAGE_SIZE)))
			goto reset_slab;
	}

	for (i = CONFIG_KFENCE_NUM_OBJECTS; i > 0; i--) {
		rand = get_random_u32_below(i);
		swap(kfence_metadata_init[i - 1].addr, kfence_metadata_init[rand].addr);
	}

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
		struct kfence_metadata *meta_1 = &kfence_metadata_init[i];
		struct kfence_metadata *meta_2 = &kfence_metadata_init[meta_1->addr];

		list_add_tail(&meta_2->list, &kfence_freelist);
	}
	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
		kfence_metadata_init[i].addr = addr;
		addr += 2 * PAGE_SIZE;
	}

	/*
	 * Make kfence_metadata visible only when initialization is successful.
	 * Otherwise, if the initialization fails and kfence_metadata is freed,
	 * it may cause UAF in kfence_shutdown_cache().
	 */
	smp_store_release(&kfence_metadata, kfence_metadata_init);
	return 0;

reset_slab:
	addr += 2 * i * PAGE_SIZE;
	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
		struct page *page;

		if (!i || (i % 2))
			continue;

		page = pfn_to_page(start_pfn + i);
#ifdef CONFIG_MEMCG
		struct slab *slab = page_slab(page);
		slab->obj_exts = 0;
#endif
		__ClearPageSlab(page);
	}

	return addr;
}

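/*
 * Initialize the boot-time (memblock-allocated) pool; on failure, the unused
 * part of the pool and the metadata are released again.
 */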
static bool __init kfence_init_pool_early(void)
{
	unsigned long addr;

	if (!__kfence_pool)
		return false;

	addr = kfence_init_pool();

	if (!addr) {
		/*
		 * The pool is live and will never be deallocated from this point on.
		 * Ignore the pool object from the kmemleak phys object tree, as it would
		 * otherwise overlap with allocations returned by kfence_alloc(), which
		 * are registered with kmemleak through the slab post-alloc hook.
		 */
		kmemleak_ignore_phys(__pa(__kfence_pool));
		return true;
	}

	/*
	 * Only release unprotected pages, and do not try to go back and change
	 * page attributes due to risk of failing to do so as well. If changing
	 * page attributes for some pages fails, it is very likely that it also
	 * fails for the first page, and therefore expect addr==__kfence_pool in
	 * most failure cases.
	 */
	memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
	__kfence_pool = NULL;

	memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
	kfence_metadata_init = NULL;

	return false;
}

/* === DebugFS Interface ==================================================== */

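/* Show statistics counters in /sys/kernel/debug/kfence/stats. */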
static int stats_show(struct seq_file *seq, void *v)
{
	int i;

	seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
	for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
		seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(stats);

/*
 * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
 * start_object() and next_object() return the object index + 1, because NULL is used
 * to stop iteration.
 */
static void *start_object(struct seq_file *seq, loff_t *pos)
{
	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
		return (void *)((long)*pos + 1);
	return NULL;
}

static void stop_object(struct seq_file *seq, void *v)
{
}

static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
		return (void *)((long)*pos + 1);
	return NULL;
}

static int show_object(struct seq_file *seq, void *v)
{
	struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
	unsigned long flags;

	raw_spin_lock_irqsave(&meta->lock, flags);
	kfence_print_object(seq, meta);
	raw_spin_unlock_irqrestore(&meta->lock, flags);
	seq_puts(seq, "---------------------------------\n");

	return 0;
}

static const struct seq_operations objects_sops = {
	.start = start_object,
	.next = next_object,
	.stop = stop_object,
	.show = show_object,
};
DEFINE_SEQ_ATTRIBUTE(objects);

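/* Create the /sys/kernel/debug/kfence directory with the "stats" and "objects" files. */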
static int kfence_debugfs_init(void)
{
	struct dentry *kfence_dir;

	if (!READ_ONCE(kfence_enabled))
		return 0;

	kfence_dir = debugfs_create_dir("kfence", NULL);
	debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
	debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
	return 0;
}

late_initcall(kfence_debugfs_init);

/* === Panic Notifier ====================================================== */

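/* Check canary bytes of all currently allocated objects; used by the panic notifier. */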
static void kfence_check_all_canary(void)
{
	int i;

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
		struct kfence_metadata *meta = &kfence_metadata[i];

		if (kfence_obj_allocated(meta))
			check_canary(meta);
	}
}

static int kfence_check_canary_callback(struct notifier_block *nb,
					unsigned long reason, void *arg)
{
	kfence_check_all_canary();
	return NOTIFY_OK;
}

static struct notifier_block kfence_check_canary_notifier = {
	.notifier_call = kfence_check_canary_callback,
};

/* === Allocation Gate Timer ================================================ */

static struct delayed_work kfence_timer;

#ifdef CONFIG_KFENCE_STATIC_KEYS
/* Wait queue to wake up allocation-gate timer task. */
static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);

static int kfence_reboot_callback(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	/*
	 * Disable KFENCE to avoid static key IPI synchronization during
	 * late shutdown/kexec.
	 */
	WRITE_ONCE(kfence_enabled, false);
	/* Cancel any pending timer work */
	cancel_delayed_work(&kfence_timer);
	/*
	 * Wake up any blocked toggle_allocation_gate() so it can complete
	 * early while the system is still able to handle IPIs.
	 */
	wake_up(&allocation_wait);

	return NOTIFY_OK;
}

static struct notifier_block kfence_reboot_notifier = {
	.notifier_call = kfence_reboot_callback,
	.priority = INT_MAX, /* Run early to stop timers ASAP */
};

static void wake_up_kfence_timer(struct irq_work *work)
{
	wake_up(&allocation_wait);
}
static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
#endif

/*
 * Set up delayed work, which will enable and disable the static key. We need to
 * use a work queue (rather than a simple timer), since enabling and disabling a
 * static key cannot be done from an interrupt.
 *
 * Note: Toggling a static branch currently causes IPIs, and here we'll end up
 * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
 * more aggressive sampling intervals), we could get away with a variant that
 * avoids IPIs, at the cost of not immediately capturing allocations if the
 * instructions remain cached.
 */
static void toggle_allocation_gate(struct work_struct *work)
{
	if (!READ_ONCE(kfence_enabled))
		return;

	atomic_set(&kfence_allocation_gate, -kfence_burst);
#ifdef CONFIG_KFENCE_STATIC_KEYS
	/* Enable static key, and await allocation to happen. */
	static_branch_enable(&kfence_allocation_key);

	wait_event_idle(allocation_wait,
			atomic_read(&kfence_allocation_gate) > 0 ||
			!READ_ONCE(kfence_enabled));

	/* Disable static key and reset timer. */
	static_branch_disable(&kfence_allocation_key);
#endif
	queue_delayed_work(system_dfl_wq, &kfence_timer,
			   msecs_to_jiffies(kfence_sample_interval));
}

/* === Public interface ===================================================== */

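/*
 * Allocate the KFENCE pool and metadata from memblock at early boot, unless
 * the architecture has already set up the pool.
 */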
void __init kfence_alloc_pool_and_metadata(void)
{
	if (!kfence_sample_interval)
		return;

	/*
	 * If the pool has already been initialized by arch, there is no need to
	 * re-allocate the memory pool.
	 */
	if (!__kfence_pool)
		__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);

	if (!__kfence_pool) {
		pr_err("failed to allocate pool\n");
		return;
	}

	/* The memory allocated by memblock has been zeroed out. */
	kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE);
	if (!kfence_metadata_init) {
		pr_err("failed to allocate metadata\n");
		memblock_free(__kfence_pool, KFENCE_POOL_SIZE);
		__kfence_pool = NULL;
	}
}

static void kfence_init_enable(void)
{
	if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
		static_branch_enable(&kfence_allocation_key);

	if (kfence_deferrable)
		INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);
	else
		INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);

	if (kfence_check_on_panic)
		atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);

#ifdef CONFIG_KFENCE_STATIC_KEYS
	register_reboot_notifier(&kfence_reboot_notifier);
#endif

	WRITE_ONCE(kfence_enabled, true);
	queue_delayed_work(system_dfl_wq, &kfence_timer, 0);

	pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
		CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
		(void *)(__kfence_pool + KFENCE_POOL_SIZE));
}

void __init kfence_init(void)
{
	stack_hash_seed = get_random_u32();

	/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
	if (!kfence_sample_interval)
		return;

	if (!kfence_init_pool_early()) {
		pr_err("%s failed\n", __func__);
		return;
	}

	kfence_init_enable();
}

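/*
 * Allocate and initialize the pool after boot, used when KFENCE is enabled
 * late via the sample_interval parameter.
 */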
static int kfence_init_late(void)
{
	const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE;
	const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE;
	unsigned long addr = (unsigned long)__kfence_pool;
	unsigned long free_size = KFENCE_POOL_SIZE;
	int err = -ENOMEM;

#ifdef CONFIG_CONTIG_ALLOC
	struct page *pages;

	pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node,
				   NULL);
	if (!pages)
		return -ENOMEM;

	__kfence_pool = page_to_virt(pages);
	pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node,
				   NULL);
	if (pages)
		kfence_metadata_init = page_to_virt(pages);
#else
	if (nr_pages_pool > MAX_ORDER_NR_PAGES ||
	    nr_pages_meta > MAX_ORDER_NR_PAGES) {
		pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
		return -EINVAL;
	}

	__kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
	if (!__kfence_pool)
		return -ENOMEM;

	kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL);
#endif

	if (!kfence_metadata_init)
		goto free_pool;

	memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE);
	addr = kfence_init_pool();
	if (!addr) {
		kfence_init_enable();
		kfence_debugfs_init();
		return 0;
	}

	pr_err("%s failed\n", __func__);
	free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
	err = -EBUSY;

#ifdef CONFIG_CONTIG_ALLOC
	free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)),
			  nr_pages_meta);
free_pool:
	free_contig_range(page_to_pfn(virt_to_page((void *)addr)),
			  free_size / PAGE_SIZE);
#else
	free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE);
free_pool:
	free_pages_exact((void *)addr, free_size);
#endif

	kfence_metadata_init = NULL;
	__kfence_pool = NULL;
	return err;
}

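/* Re-enable KFENCE at runtime; allocate and initialize the pool first if it does not exist yet. */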
static int kfence_enable_late(void)
{
	if (!__kfence_pool)
		return kfence_init_late();

	WRITE_ONCE(kfence_enabled, true);
	queue_delayed_work(system_dfl_wq, &kfence_timer, 0);
	pr_info("re-enabled\n");
	return 0;
}

void kfence_shutdown_cache(struct kmem_cache *s)
{
	unsigned long flags;
	struct kfence_metadata *meta;
	int i;

	/* Pairs with release in kfence_init_pool(). */
	if (!smp_load_acquire(&kfence_metadata))
		return;

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
		bool in_use;

		meta = &kfence_metadata[i];

		/*
		 * If we observe some inconsistent cache and state pair where we
		 * should have returned false here, cache destruction is racing
		 * with either kmem_cache_alloc() or kmem_cache_free(). Taking
		 * the lock will not help, as different critical section
		 * serialization will have the same outcome.
		 */
		if (READ_ONCE(meta->cache) != s || !kfence_obj_allocated(meta))
			continue;

		raw_spin_lock_irqsave(&meta->lock, flags);
		in_use = meta->cache == s && kfence_obj_allocated(meta);
		raw_spin_unlock_irqrestore(&meta->lock, flags);

		if (in_use) {
			/*
			 * This cache still has allocations, and we should not
			 * release them back into the freelist so they can still
			 * safely be used and retain the kernel's default
			 * behaviour of keeping the allocations alive (leak the
			 * cache); however, they effectively become "zombie
			 * allocations" as the KFENCE objects are the only ones
			 * still in use and the owning cache is being destroyed.
			 *
			 * We mark them freed, so that any subsequent use shows
			 * more useful error messages that will include stack
			 * traces of the user of the object, the original
			 * allocation, and caller to shutdown_cache().
			 */
			kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
		}
	}

	for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
		meta = &kfence_metadata[i];

		/* See above. */
		if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
			continue;

		raw_spin_lock_irqsave(&meta->lock, flags);
		if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
			meta->cache = NULL;
		raw_spin_unlock_irqrestore(&meta->lock, flags);
	}
}

void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
	unsigned long stack_entries[KFENCE_STACK_DEPTH];
	size_t num_stack_entries;
	u32 alloc_stack_hash;
	int allocation_gate;

	/*
	 * Perform size check before switching kfence_allocation_gate, so that
	 * we don't disable KFENCE without making an allocation.
	 */
	if (size > PAGE_SIZE) {
		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
		return NULL;
	}

	/*
	 * Skip allocations from non-default zones, including DMA. We cannot
	 * guarantee that pages in the KFENCE pool will have the requested
	 * properties (e.g. reside in DMAable memory).
	 */
	if ((flags & GFP_ZONEMASK) ||
	    ((flags & __GFP_THISNODE) && num_online_nodes() > 1) ||
	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
		return NULL;
	}

	/*
	 * Skip allocations for this slab, if KFENCE has been disabled for
	 * this slab.
	 */
	if (s->flags & SLAB_SKIP_KFENCE)
		return NULL;

	allocation_gate = atomic_inc_return(&kfence_allocation_gate);
	if (allocation_gate > 1)
		return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
	/*
	 * waitqueue_active() is fully ordered after the update of
	 * kfence_allocation_gate per atomic_inc_return().
	 */
	if (allocation_gate == 1 && waitqueue_active(&allocation_wait)) {
		/*
		 * Calling wake_up() here may deadlock when allocations happen
		 * from within timer code. Use an irq_work to defer it.
		 */
		irq_work_queue(&wake_up_kfence_timer_work);
	}
#endif

	if (!READ_ONCE(kfence_enabled))
		return NULL;

	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);

	/*
	 * Do expensive check for coverage of allocation in slow-path after
	 * allocation_gate has already become non-zero, even though it might
	 * mean not making any allocation within a given sample interval.
	 *
	 * This ensures reasonable allocation coverage when the pool is almost
	 * full, including avoiding long-lived allocations of the same source
	 * filling up the pool (e.g. pagecache allocations).
	 */
	alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
	if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
		return NULL;
	}

	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
				    alloc_stack_hash);
}

size_t kfence_ksize(const void *addr)
{
	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

	/*
	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
	 * either a use-after-free or invalid access.
	 */
	return meta ? meta->size : 0;
}

void *kfence_object_start(const void *addr)
{
	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

	/*
	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
	 * either a use-after-free or invalid access.
	 */
	return meta ? (void *)meta->addr : NULL;
}

void __kfence_free(void *addr)
{
	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

#ifdef CONFIG_MEMCG
	KFENCE_WARN_ON(meta->obj_exts.objcg);
#endif
	/*
	 * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
	 * the object, as the object page may be recycled for other-typed
	 * objects once it has been freed. meta->cache may be NULL if the cache
	 * was destroyed.
	 * Save the stack trace here so that reports show where the user freed
	 * the object.
	 */
	if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) {
		unsigned long flags;

		raw_spin_lock_irqsave(&meta->lock, flags);
		metadata_update_state(meta, KFENCE_OBJECT_RCU_FREEING, NULL, 0);
		raw_spin_unlock_irqrestore(&meta->lock, flags);
		call_rcu(&meta->rcu_head, rcu_guarded_free);
	} else {
		kfence_guarded_free(addr, meta, false);
	}
}

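/*
 * Handle a page fault within the KFENCE pool: determine whether it is an
 * out-of-bounds, use-after-free, or invalid access, report it, and unprotect
 * the page so the faulting access can proceed.
 */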
bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
{
	const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
	struct kfence_metadata *to_report = NULL;
	unsigned long unprotected_page = 0;
	enum kfence_error_type error_type;
	unsigned long flags;

	if (!is_kfence_address((void *)addr))
		return false;

	if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
		return kfence_unprotect(addr); /* ... unprotect and proceed. */

	atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

	if (page_index % 2) {
		/* This is a redzone, report a buffer overflow. */
		struct kfence_metadata *meta;
		int distance = 0;

		meta = addr_to_metadata(addr - PAGE_SIZE);
		if (meta && kfence_obj_allocated(meta)) {
			to_report = meta;
			/* Data race ok; distance calculation approximate. */
			distance = addr - data_race(meta->addr + meta->size);
		}

		meta = addr_to_metadata(addr + PAGE_SIZE);
		if (meta && kfence_obj_allocated(meta)) {
			/* Data race ok; distance calculation approximate. */
			if (!to_report || distance > data_race(meta->addr) - addr)
				to_report = meta;
		}

		if (!to_report)
			goto out;

		error_type = KFENCE_ERROR_OOB;
		unprotected_page = addr;

		/*
		 * If the object was freed before we took the lock, we can
		 * still report this as an OOB -- the report will simply show
		 * the stacktrace of the free as well.
		 */
	} else {
		to_report = addr_to_metadata(addr);
		if (!to_report)
			goto out;

		error_type = KFENCE_ERROR_UAF;
		/*
		 * We may race with __kfence_alloc(), and it is possible that a
		 * freed object may be reallocated. We simply report this as a
		 * use-after-free, with the stack trace showing the place where
		 * the object was re-allocated.
		 */
	}

out:
	if (to_report) {
		raw_spin_lock_irqsave(&to_report->lock, flags);
		to_report->unprotected_page = unprotected_page;
		kfence_report_error(addr, is_write, regs, to_report, error_type);
		raw_spin_unlock_irqrestore(&to_report->lock, flags);
	} else {
		/* This may be a UAF or OOB access, but we can't be sure. */
		kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
	}

	return kfence_unprotect(addr); /* Unprotect and let access proceed. */
}