10ce20dd8SAlexander Potapenko // SPDX-License-Identifier: GPL-2.0 20ce20dd8SAlexander Potapenko /* 30ce20dd8SAlexander Potapenko * KFENCE guarded object allocator and fault handling. 40ce20dd8SAlexander Potapenko * 50ce20dd8SAlexander Potapenko * Copyright (C) 2020, Google LLC. 60ce20dd8SAlexander Potapenko */ 70ce20dd8SAlexander Potapenko 80ce20dd8SAlexander Potapenko #define pr_fmt(fmt) "kfence: " fmt 90ce20dd8SAlexander Potapenko 100ce20dd8SAlexander Potapenko #include <linux/atomic.h> 110ce20dd8SAlexander Potapenko #include <linux/bug.h> 120ce20dd8SAlexander Potapenko #include <linux/debugfs.h> 1308f6b106SMarco Elver #include <linux/hash.h> 14407f1d8cSMarco Elver #include <linux/irq_work.h> 1508f6b106SMarco Elver #include <linux/jhash.h> 160ce20dd8SAlexander Potapenko #include <linux/kcsan-checks.h> 170ce20dd8SAlexander Potapenko #include <linux/kfence.h> 1895511580SMarco Elver #include <linux/kmemleak.h> 190ce20dd8SAlexander Potapenko #include <linux/list.h> 200ce20dd8SAlexander Potapenko #include <linux/lockdep.h> 2108f6b106SMarco Elver #include <linux/log2.h> 220ce20dd8SAlexander Potapenko #include <linux/memblock.h> 230ce20dd8SAlexander Potapenko #include <linux/moduleparam.h> 243c81b3bbShuangshaobo #include <linux/notifier.h> 253c81b3bbShuangshaobo #include <linux/panic_notifier.h> 260ce20dd8SAlexander Potapenko #include <linux/random.h> 270ce20dd8SAlexander Potapenko #include <linux/rcupdate.h> 284bbf04aaSMarco Elver #include <linux/sched/clock.h> 290ce20dd8SAlexander Potapenko #include <linux/seq_file.h> 300ce20dd8SAlexander Potapenko #include <linux/slab.h> 310ce20dd8SAlexander Potapenko #include <linux/spinlock.h> 320ce20dd8SAlexander Potapenko #include <linux/string.h> 330ce20dd8SAlexander Potapenko 340ce20dd8SAlexander Potapenko #include <asm/kfence.h> 350ce20dd8SAlexander Potapenko 360ce20dd8SAlexander Potapenko #include "kfence.h" 370ce20dd8SAlexander Potapenko 380ce20dd8SAlexander Potapenko /* Disables KFENCE on the first warning assuming an 
irrecoverable error. */ 390ce20dd8SAlexander Potapenko #define KFENCE_WARN_ON(cond) \ 400ce20dd8SAlexander Potapenko ({ \ 410ce20dd8SAlexander Potapenko const bool __cond = WARN_ON(cond); \ 42698361bcSTianchen Ding if (unlikely(__cond)) { \ 430ce20dd8SAlexander Potapenko WRITE_ONCE(kfence_enabled, false); \ 44698361bcSTianchen Ding disabled_by_warn = true; \ 45698361bcSTianchen Ding } \ 460ce20dd8SAlexander Potapenko __cond; \ 470ce20dd8SAlexander Potapenko }) 480ce20dd8SAlexander Potapenko 490ce20dd8SAlexander Potapenko /* === Data ================================================================= */ 500ce20dd8SAlexander Potapenko 510ce20dd8SAlexander Potapenko static bool kfence_enabled __read_mostly; 52698361bcSTianchen Ding static bool disabled_by_warn __read_mostly; 530ce20dd8SAlexander Potapenko 548913c610SPeng Liu unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; 558913c610SPeng Liu EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ 560ce20dd8SAlexander Potapenko 570ce20dd8SAlexander Potapenko #ifdef MODULE_PARAM_PREFIX 580ce20dd8SAlexander Potapenko #undef MODULE_PARAM_PREFIX 590ce20dd8SAlexander Potapenko #endif 600ce20dd8SAlexander Potapenko #define MODULE_PARAM_PREFIX "kfence." 610ce20dd8SAlexander Potapenko 62698361bcSTianchen Ding static int kfence_enable_late(void); 630ce20dd8SAlexander Potapenko static int param_set_sample_interval(const char *val, const struct kernel_param *kp) 640ce20dd8SAlexander Potapenko { 650ce20dd8SAlexander Potapenko unsigned long num; 660ce20dd8SAlexander Potapenko int ret = kstrtoul(val, 0, &num); 670ce20dd8SAlexander Potapenko 680ce20dd8SAlexander Potapenko if (ret < 0) 690ce20dd8SAlexander Potapenko return ret; 700ce20dd8SAlexander Potapenko 7183d7d04fSJackie Liu /* Using 0 to indicate KFENCE is disabled. 
*/ 7283d7d04fSJackie Liu if (!num && READ_ONCE(kfence_enabled)) { 7383d7d04fSJackie Liu pr_info("disabled\n"); 740ce20dd8SAlexander Potapenko WRITE_ONCE(kfence_enabled, false); 7583d7d04fSJackie Liu } 760ce20dd8SAlexander Potapenko 770ce20dd8SAlexander Potapenko *((unsigned long *)kp->arg) = num; 78698361bcSTianchen Ding 79698361bcSTianchen Ding if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) 80698361bcSTianchen Ding return disabled_by_warn ? -EINVAL : kfence_enable_late(); 810ce20dd8SAlexander Potapenko return 0; 820ce20dd8SAlexander Potapenko } 830ce20dd8SAlexander Potapenko 840ce20dd8SAlexander Potapenko static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) 850ce20dd8SAlexander Potapenko { 860ce20dd8SAlexander Potapenko if (!READ_ONCE(kfence_enabled)) 870ce20dd8SAlexander Potapenko return sprintf(buffer, "0\n"); 880ce20dd8SAlexander Potapenko 890ce20dd8SAlexander Potapenko return param_get_ulong(buffer, kp); 900ce20dd8SAlexander Potapenko } 910ce20dd8SAlexander Potapenko 920ce20dd8SAlexander Potapenko static const struct kernel_param_ops sample_interval_param_ops = { 930ce20dd8SAlexander Potapenko .set = param_set_sample_interval, 940ce20dd8SAlexander Potapenko .get = param_get_sample_interval, 950ce20dd8SAlexander Potapenko }; 960ce20dd8SAlexander Potapenko module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); 970ce20dd8SAlexander Potapenko 9808f6b106SMarco Elver /* Pool usage% threshold when currently covered allocations are skipped. */ 9908f6b106SMarco Elver static unsigned long kfence_skip_covered_thresh __read_mostly = 75; 10008f6b106SMarco Elver module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); 10108f6b106SMarco Elver 102737b6a10SMarco Elver /* If true, use a deferrable timer. 
*/ 103737b6a10SMarco Elver static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); 104737b6a10SMarco Elver module_param_named(deferrable, kfence_deferrable, bool, 0444); 105737b6a10SMarco Elver 1063c81b3bbShuangshaobo /* If true, check all canary bytes on panic. */ 1073c81b3bbShuangshaobo static bool kfence_check_on_panic __read_mostly; 1083c81b3bbShuangshaobo module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444); 1093c81b3bbShuangshaobo 1100ce20dd8SAlexander Potapenko /* The pool of pages used for guard pages and objects. */ 111b33f778bSTianchen Ding char *__kfence_pool __read_mostly; 1120ce20dd8SAlexander Potapenko EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ 1130ce20dd8SAlexander Potapenko 1140ce20dd8SAlexander Potapenko /* 1150ce20dd8SAlexander Potapenko * Per-object metadata, with one-to-one mapping of object metadata to 1160ce20dd8SAlexander Potapenko * backing pages (in __kfence_pool). 1170ce20dd8SAlexander Potapenko */ 1180ce20dd8SAlexander Potapenko static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); 119cabdf74eSPeng Zhang struct kfence_metadata *kfence_metadata __read_mostly; 120cabdf74eSPeng Zhang 121cabdf74eSPeng Zhang /* 122cabdf74eSPeng Zhang * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache(). 123cabdf74eSPeng Zhang * So introduce kfence_metadata_init to initialize metadata, and then make 124cabdf74eSPeng Zhang * kfence_metadata visible after initialization is successful. This prevents 125cabdf74eSPeng Zhang * potential UAF or access to uninitialized metadata. 126cabdf74eSPeng Zhang */ 127cabdf74eSPeng Zhang static struct kfence_metadata *kfence_metadata_init __read_mostly; 1280ce20dd8SAlexander Potapenko 1290ce20dd8SAlexander Potapenko /* Freelist with available objects. 
*/ 1300ce20dd8SAlexander Potapenko static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); 1310ce20dd8SAlexander Potapenko static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ 1320ce20dd8SAlexander Potapenko 13307e8481dSMarco Elver /* 13407e8481dSMarco Elver * The static key to set up a KFENCE allocation; or if static keys are not used 13507e8481dSMarco Elver * to gate allocations, to avoid a load and compare if KFENCE is disabled. 13607e8481dSMarco Elver */ 1370ce20dd8SAlexander Potapenko DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); 1380ce20dd8SAlexander Potapenko 1390ce20dd8SAlexander Potapenko /* Gates the allocation, ensuring only one succeeds in a given period. */ 1400ce20dd8SAlexander Potapenko atomic_t kfence_allocation_gate = ATOMIC_INIT(1); 1410ce20dd8SAlexander Potapenko 14208f6b106SMarco Elver /* 14308f6b106SMarco Elver * A Counting Bloom filter of allocation coverage: limits currently covered 14408f6b106SMarco Elver * allocations of the same source filling up the pool. 
14508f6b106SMarco Elver * 14608f6b106SMarco Elver * Assuming a range of 15%-85% unique allocations in the pool at any point in 14708f6b106SMarco Elver * time, the below parameters provide a probablity of 0.02-0.33 for false 14808f6b106SMarco Elver * positive hits respectively: 14908f6b106SMarco Elver * 15008f6b106SMarco Elver * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM 15108f6b106SMarco Elver */ 15208f6b106SMarco Elver #define ALLOC_COVERED_HNUM 2 15308f6b106SMarco Elver #define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) 15408f6b106SMarco Elver #define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) 15508f6b106SMarco Elver #define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) 15608f6b106SMarco Elver #define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) 15708f6b106SMarco Elver static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; 15808f6b106SMarco Elver 15908f6b106SMarco Elver /* Stack depth used to determine uniqueness of an allocation. */ 16008f6b106SMarco Elver #define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) 16108f6b106SMarco Elver 16208f6b106SMarco Elver /* 16308f6b106SMarco Elver * Randomness for stack hashes, making the same collisions across reboots and 16408f6b106SMarco Elver * different machines less likely. 16508f6b106SMarco Elver */ 16608f6b106SMarco Elver static u32 stack_hash_seed __ro_after_init; 16708f6b106SMarco Elver 1680ce20dd8SAlexander Potapenko /* Statistics counters for debugfs. 
*/ 1690ce20dd8SAlexander Potapenko enum kfence_counter_id { 1700ce20dd8SAlexander Potapenko KFENCE_COUNTER_ALLOCATED, 1710ce20dd8SAlexander Potapenko KFENCE_COUNTER_ALLOCS, 1720ce20dd8SAlexander Potapenko KFENCE_COUNTER_FREES, 1730ce20dd8SAlexander Potapenko KFENCE_COUNTER_ZOMBIES, 1740ce20dd8SAlexander Potapenko KFENCE_COUNTER_BUGS, 1759a19aeb5SMarco Elver KFENCE_COUNTER_SKIP_INCOMPAT, 1769a19aeb5SMarco Elver KFENCE_COUNTER_SKIP_CAPACITY, 17708f6b106SMarco Elver KFENCE_COUNTER_SKIP_COVERED, 1780ce20dd8SAlexander Potapenko KFENCE_COUNTER_COUNT, 1790ce20dd8SAlexander Potapenko }; 1800ce20dd8SAlexander Potapenko static atomic_long_t counters[KFENCE_COUNTER_COUNT]; 1810ce20dd8SAlexander Potapenko static const char *const counter_names[] = { 1820ce20dd8SAlexander Potapenko [KFENCE_COUNTER_ALLOCATED] = "currently allocated", 1830ce20dd8SAlexander Potapenko [KFENCE_COUNTER_ALLOCS] = "total allocations", 1840ce20dd8SAlexander Potapenko [KFENCE_COUNTER_FREES] = "total frees", 1850ce20dd8SAlexander Potapenko [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", 1860ce20dd8SAlexander Potapenko [KFENCE_COUNTER_BUGS] = "total bugs", 1879a19aeb5SMarco Elver [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", 1889a19aeb5SMarco Elver [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", 18908f6b106SMarco Elver [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", 1900ce20dd8SAlexander Potapenko }; 1910ce20dd8SAlexander Potapenko static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); 1920ce20dd8SAlexander Potapenko 1930ce20dd8SAlexander Potapenko /* === Internals ============================================================ */ 1940ce20dd8SAlexander Potapenko 19508f6b106SMarco Elver static inline bool should_skip_covered(void) 19608f6b106SMarco Elver { 19708f6b106SMarco Elver unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100; 19808f6b106SMarco Elver 19908f6b106SMarco Elver return 
atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; 20008f6b106SMarco Elver } 20108f6b106SMarco Elver 20208f6b106SMarco Elver static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) 20308f6b106SMarco Elver { 20408f6b106SMarco Elver num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); 20508f6b106SMarco Elver num_entries = filter_irq_stacks(stack_entries, num_entries); 20608f6b106SMarco Elver return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); 20708f6b106SMarco Elver } 20808f6b106SMarco Elver 20908f6b106SMarco Elver /* 21008f6b106SMarco Elver * Adds (or subtracts) count @val for allocation stack trace hash 21108f6b106SMarco Elver * @alloc_stack_hash from Counting Bloom filter. 21208f6b106SMarco Elver */ 21308f6b106SMarco Elver static void alloc_covered_add(u32 alloc_stack_hash, int val) 21408f6b106SMarco Elver { 21508f6b106SMarco Elver int i; 21608f6b106SMarco Elver 21708f6b106SMarco Elver for (i = 0; i < ALLOC_COVERED_HNUM; i++) { 21808f6b106SMarco Elver atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); 21908f6b106SMarco Elver alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); 22008f6b106SMarco Elver } 22108f6b106SMarco Elver } 22208f6b106SMarco Elver 22308f6b106SMarco Elver /* 22408f6b106SMarco Elver * Returns true if the allocation stack trace hash @alloc_stack_hash is 22508f6b106SMarco Elver * currently contained (non-zero count) in Counting Bloom filter. 
22608f6b106SMarco Elver */ 22708f6b106SMarco Elver static bool alloc_covered_contains(u32 alloc_stack_hash) 22808f6b106SMarco Elver { 22908f6b106SMarco Elver int i; 23008f6b106SMarco Elver 23108f6b106SMarco Elver for (i = 0; i < ALLOC_COVERED_HNUM; i++) { 23208f6b106SMarco Elver if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) 23308f6b106SMarco Elver return false; 23408f6b106SMarco Elver alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); 23508f6b106SMarco Elver } 23608f6b106SMarco Elver 23708f6b106SMarco Elver return true; 23808f6b106SMarco Elver } 23908f6b106SMarco Elver 2400ce20dd8SAlexander Potapenko static bool kfence_protect(unsigned long addr) 2410ce20dd8SAlexander Potapenko { 2420ce20dd8SAlexander Potapenko return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); 2430ce20dd8SAlexander Potapenko } 2440ce20dd8SAlexander Potapenko 2450ce20dd8SAlexander Potapenko static bool kfence_unprotect(unsigned long addr) 2460ce20dd8SAlexander Potapenko { 2470ce20dd8SAlexander Potapenko return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); 2480ce20dd8SAlexander Potapenko } 2490ce20dd8SAlexander Potapenko 2500ce20dd8SAlexander Potapenko static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) 2510ce20dd8SAlexander Potapenko { 2520ce20dd8SAlexander Potapenko unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; 2530ce20dd8SAlexander Potapenko unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; 2540ce20dd8SAlexander Potapenko 2550ce20dd8SAlexander Potapenko /* The checks do not affect performance; only called from slow-paths. */ 2560ce20dd8SAlexander Potapenko 2570ce20dd8SAlexander Potapenko /* Only call with a pointer into kfence_metadata. 
*/ 2580ce20dd8SAlexander Potapenko if (KFENCE_WARN_ON(meta < kfence_metadata || 2590ce20dd8SAlexander Potapenko meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) 2600ce20dd8SAlexander Potapenko return 0; 2610ce20dd8SAlexander Potapenko 2620ce20dd8SAlexander Potapenko /* 2630ce20dd8SAlexander Potapenko * This metadata object only ever maps to 1 page; verify that the stored 2640ce20dd8SAlexander Potapenko * address is in the expected range. 2650ce20dd8SAlexander Potapenko */ 2660ce20dd8SAlexander Potapenko if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) 2670ce20dd8SAlexander Potapenko return 0; 2680ce20dd8SAlexander Potapenko 2690ce20dd8SAlexander Potapenko return pageaddr; 2700ce20dd8SAlexander Potapenko } 2710ce20dd8SAlexander Potapenko 2720ce20dd8SAlexander Potapenko /* 2730ce20dd8SAlexander Potapenko * Update the object's metadata state, including updating the alloc/free stacks 2740ce20dd8SAlexander Potapenko * depending on the state transition. 2750ce20dd8SAlexander Potapenko */ 276a9ab52bbSMarco Elver static noinline void 277a9ab52bbSMarco Elver metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, 278a9ab52bbSMarco Elver unsigned long *stack_entries, size_t num_stack_entries) 2790ce20dd8SAlexander Potapenko { 2800ce20dd8SAlexander Potapenko struct kfence_track *track = 2810ce20dd8SAlexander Potapenko next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; 2820ce20dd8SAlexander Potapenko 2830ce20dd8SAlexander Potapenko lockdep_assert_held(&meta->lock); 2840ce20dd8SAlexander Potapenko 285a9ab52bbSMarco Elver if (stack_entries) { 286a9ab52bbSMarco Elver memcpy(track->stack_entries, stack_entries, 287a9ab52bbSMarco Elver num_stack_entries * sizeof(stack_entries[0])); 288a9ab52bbSMarco Elver } else { 2890ce20dd8SAlexander Potapenko /* 290a9ab52bbSMarco Elver * Skip over 1 (this) functions; noinline ensures we do not 291a9ab52bbSMarco Elver * accidentally skip over the caller by never inlining. 
2920ce20dd8SAlexander Potapenko */ 293a9ab52bbSMarco Elver num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); 294a9ab52bbSMarco Elver } 295a9ab52bbSMarco Elver track->num_stack_entries = num_stack_entries; 2960ce20dd8SAlexander Potapenko track->pid = task_pid_nr(current); 2974bbf04aaSMarco Elver track->cpu = raw_smp_processor_id(); 2984bbf04aaSMarco Elver track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ 2990ce20dd8SAlexander Potapenko 3000ce20dd8SAlexander Potapenko /* 3010ce20dd8SAlexander Potapenko * Pairs with READ_ONCE() in 3020ce20dd8SAlexander Potapenko * kfence_shutdown_cache(), 3030ce20dd8SAlexander Potapenko * kfence_handle_page_fault(). 3040ce20dd8SAlexander Potapenko */ 3050ce20dd8SAlexander Potapenko WRITE_ONCE(meta->state, next); 3060ce20dd8SAlexander Potapenko } 3070ce20dd8SAlexander Potapenko 3084d7b5a2cSIlya Leoshkevich #ifdef CONFIG_KMSAN 3094d7b5a2cSIlya Leoshkevich #define check_canary_attributes noinline __no_kmsan_checks 3104d7b5a2cSIlya Leoshkevich #else 3114d7b5a2cSIlya Leoshkevich #define check_canary_attributes inline 3124d7b5a2cSIlya Leoshkevich #endif 3134d7b5a2cSIlya Leoshkevich 3140ce20dd8SAlexander Potapenko /* Check canary byte at @addr. 
*/ 3154d7b5a2cSIlya Leoshkevich static check_canary_attributes bool check_canary_byte(u8 *addr) 3160ce20dd8SAlexander Potapenko { 31749332956SMarco Elver struct kfence_metadata *meta; 31849332956SMarco Elver unsigned long flags; 31949332956SMarco Elver 3201ba3cbf3SPeng Zhang if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr))) 3210ce20dd8SAlexander Potapenko return true; 3220ce20dd8SAlexander Potapenko 3230ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); 32449332956SMarco Elver 32549332956SMarco Elver meta = addr_to_metadata((unsigned long)addr); 32649332956SMarco Elver raw_spin_lock_irqsave(&meta->lock, flags); 32749332956SMarco Elver kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); 32849332956SMarco Elver raw_spin_unlock_irqrestore(&meta->lock, flags); 32949332956SMarco Elver 3300ce20dd8SAlexander Potapenko return false; 3310ce20dd8SAlexander Potapenko } 3320ce20dd8SAlexander Potapenko 3331ba3cbf3SPeng Zhang static inline void set_canary(const struct kfence_metadata *meta) 3340ce20dd8SAlexander Potapenko { 3350ce20dd8SAlexander Potapenko const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); 3361ba3cbf3SPeng Zhang unsigned long addr = pageaddr; 3370ce20dd8SAlexander Potapenko 3380ce20dd8SAlexander Potapenko /* 3391ba3cbf3SPeng Zhang * The canary may be written to part of the object memory, but it does 3401ba3cbf3SPeng Zhang * not affect it. The user should initialize the object before using it. 
3411ba3cbf3SPeng Zhang */ 3421ba3cbf3SPeng Zhang for (; addr < meta->addr; addr += sizeof(u64)) 3431ba3cbf3SPeng Zhang *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; 3441ba3cbf3SPeng Zhang 3451ba3cbf3SPeng Zhang addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64)); 3461ba3cbf3SPeng Zhang for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) 3471ba3cbf3SPeng Zhang *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; 3481ba3cbf3SPeng Zhang } 3491ba3cbf3SPeng Zhang 3504d7b5a2cSIlya Leoshkevich static check_canary_attributes void 3514d7b5a2cSIlya Leoshkevich check_canary(const struct kfence_metadata *meta) 3521ba3cbf3SPeng Zhang { 3531ba3cbf3SPeng Zhang const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); 3541ba3cbf3SPeng Zhang unsigned long addr = pageaddr; 3551ba3cbf3SPeng Zhang 3561ba3cbf3SPeng Zhang /* 3571ba3cbf3SPeng Zhang * We'll iterate over each canary byte per-side until a corrupted byte 3581ba3cbf3SPeng Zhang * is found. However, we'll still iterate over the canary bytes to the 3590ce20dd8SAlexander Potapenko * right of the object even if there was an error in the canary bytes to 3600ce20dd8SAlexander Potapenko * the left of the object. Specifically, if check_canary_byte() 3610ce20dd8SAlexander Potapenko * generates an error, showing both sides might give more clues as to 3620ce20dd8SAlexander Potapenko * what the error is about when displaying which bytes were corrupted. 3630ce20dd8SAlexander Potapenko */ 3640ce20dd8SAlexander Potapenko 3650ce20dd8SAlexander Potapenko /* Apply to left of object. 
*/ 3661ba3cbf3SPeng Zhang for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) { 3671ba3cbf3SPeng Zhang if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) 3681ba3cbf3SPeng Zhang break; 3691ba3cbf3SPeng Zhang } 3701ba3cbf3SPeng Zhang 3711ba3cbf3SPeng Zhang /* 3721ba3cbf3SPeng Zhang * If the canary is corrupted in a certain 64 bytes, or the canary 3731ba3cbf3SPeng Zhang * memory cannot be completely covered by multiple consecutive 64 bytes, 3741ba3cbf3SPeng Zhang * it needs to be checked one by one. 3751ba3cbf3SPeng Zhang */ 3761ba3cbf3SPeng Zhang for (; addr < meta->addr; addr++) { 3771ba3cbf3SPeng Zhang if (unlikely(!check_canary_byte((u8 *)addr))) 3780ce20dd8SAlexander Potapenko break; 3790ce20dd8SAlexander Potapenko } 3800ce20dd8SAlexander Potapenko 3810ce20dd8SAlexander Potapenko /* Apply to right of object. */ 3821ba3cbf3SPeng Zhang for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) { 3831ba3cbf3SPeng Zhang if (unlikely(!check_canary_byte((u8 *)addr))) 3841ba3cbf3SPeng Zhang return; 3851ba3cbf3SPeng Zhang } 3861ba3cbf3SPeng Zhang for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) { 3871ba3cbf3SPeng Zhang if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) { 3881ba3cbf3SPeng Zhang 3891ba3cbf3SPeng Zhang for (; addr - pageaddr < PAGE_SIZE; addr++) { 3901ba3cbf3SPeng Zhang if (!check_canary_byte((u8 *)addr)) 3911ba3cbf3SPeng Zhang return; 3921ba3cbf3SPeng Zhang } 3931ba3cbf3SPeng Zhang } 3940ce20dd8SAlexander Potapenko } 3950ce20dd8SAlexander Potapenko } 3960ce20dd8SAlexander Potapenko 397a9ab52bbSMarco Elver static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, 39808f6b106SMarco Elver unsigned long *stack_entries, size_t num_stack_entries, 39908f6b106SMarco Elver u32 alloc_stack_hash) 4000ce20dd8SAlexander Potapenko { 4010ce20dd8SAlexander Potapenko struct kfence_metadata *meta = NULL; 4020ce20dd8SAlexander Potapenko unsigned long flags; 4038dae0cfeSVlastimil Babka struct slab *slab; 
4040ce20dd8SAlexander Potapenko void *addr; 4058032bf12SJason A. Donenfeld const bool random_right_allocate = get_random_u32_below(2); 406327b18b7SJason A. Donenfeld const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && 4078032bf12SJason A. Donenfeld !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS); 4080ce20dd8SAlexander Potapenko 4090ce20dd8SAlexander Potapenko /* Try to obtain a free object. */ 4100ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&kfence_freelist_lock, flags); 4110ce20dd8SAlexander Potapenko if (!list_empty(&kfence_freelist)) { 4120ce20dd8SAlexander Potapenko meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); 4130ce20dd8SAlexander Potapenko list_del_init(&meta->list); 4140ce20dd8SAlexander Potapenko } 4150ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); 4169a19aeb5SMarco Elver if (!meta) { 4179a19aeb5SMarco Elver atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); 4180ce20dd8SAlexander Potapenko return NULL; 4199a19aeb5SMarco Elver } 4200ce20dd8SAlexander Potapenko 4210ce20dd8SAlexander Potapenko if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { 4220ce20dd8SAlexander Potapenko /* 4230ce20dd8SAlexander Potapenko * This is extremely unlikely -- we are reporting on a 4240ce20dd8SAlexander Potapenko * use-after-free, which locked meta->lock, and the reporting 4250ce20dd8SAlexander Potapenko * code via printk calls kmalloc() which ends up in 4260ce20dd8SAlexander Potapenko * kfence_alloc() and tries to grab the same object that we're 4270ce20dd8SAlexander Potapenko * reporting on. While it has never been observed, lockdep does 4280ce20dd8SAlexander Potapenko * report that there is a possibility of deadlock. Fix it by 4290ce20dd8SAlexander Potapenko * using trylock and bailing out gracefully. 
4300ce20dd8SAlexander Potapenko */ 4310ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&kfence_freelist_lock, flags); 4320ce20dd8SAlexander Potapenko /* Put the object back on the freelist. */ 4330ce20dd8SAlexander Potapenko list_add_tail(&meta->list, &kfence_freelist); 4340ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); 4350ce20dd8SAlexander Potapenko 4360ce20dd8SAlexander Potapenko return NULL; 4370ce20dd8SAlexander Potapenko } 4380ce20dd8SAlexander Potapenko 4390ce20dd8SAlexander Potapenko meta->addr = metadata_to_pageaddr(meta); 4400ce20dd8SAlexander Potapenko /* Unprotect if we're reusing this page. */ 4410ce20dd8SAlexander Potapenko if (meta->state == KFENCE_OBJECT_FREED) 4420ce20dd8SAlexander Potapenko kfence_unprotect(meta->addr); 4430ce20dd8SAlexander Potapenko 4440ce20dd8SAlexander Potapenko /* 4450ce20dd8SAlexander Potapenko * Note: for allocations made before RNG initialization, will always 4460ce20dd8SAlexander Potapenko * return zero. We still benefit from enabling KFENCE as early as 4470ce20dd8SAlexander Potapenko * possible, even when the RNG is not yet available, as this will allow 4480ce20dd8SAlexander Potapenko * KFENCE to detect bugs due to earlier allocations. The only downside 4490ce20dd8SAlexander Potapenko * is that the out-of-bounds accesses detected are deterministic for 4500ce20dd8SAlexander Potapenko * such allocations. 4510ce20dd8SAlexander Potapenko */ 452327b18b7SJason A. Donenfeld if (random_right_allocate) { 4530ce20dd8SAlexander Potapenko /* Allocate on the "right" side, re-calculate address. */ 4540ce20dd8SAlexander Potapenko meta->addr += PAGE_SIZE - size; 4550ce20dd8SAlexander Potapenko meta->addr = ALIGN_DOWN(meta->addr, cache->align); 4560ce20dd8SAlexander Potapenko } 4570ce20dd8SAlexander Potapenko 4580ce20dd8SAlexander Potapenko addr = (void *)meta->addr; 4590ce20dd8SAlexander Potapenko 4600ce20dd8SAlexander Potapenko /* Update remaining metadata. 
*/ 461a9ab52bbSMarco Elver metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); 4620ce20dd8SAlexander Potapenko /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ 4630ce20dd8SAlexander Potapenko WRITE_ONCE(meta->cache, cache); 4640ce20dd8SAlexander Potapenko meta->size = size; 46508f6b106SMarco Elver meta->alloc_stack_hash = alloc_stack_hash; 46649332956SMarco Elver raw_spin_unlock_irqrestore(&meta->lock, flags); 46708f6b106SMarco Elver 46849332956SMarco Elver alloc_covered_add(alloc_stack_hash, 1); 4690ce20dd8SAlexander Potapenko 4708dae0cfeSVlastimil Babka /* Set required slab fields. */ 4718dae0cfeSVlastimil Babka slab = virt_to_slab((void *)meta->addr); 4728dae0cfeSVlastimil Babka slab->slab_cache = cache; 4738dae0cfeSVlastimil Babka slab->objects = 1; 4740ce20dd8SAlexander Potapenko 4750ce20dd8SAlexander Potapenko /* Memory initialization. */ 4761ba3cbf3SPeng Zhang set_canary(meta); 4770ce20dd8SAlexander Potapenko 4780ce20dd8SAlexander Potapenko /* 4790ce20dd8SAlexander Potapenko * We check slab_want_init_on_alloc() ourselves, rather than letting 4800ce20dd8SAlexander Potapenko * SL*B do the initialization, as otherwise we might overwrite KFENCE's 4810ce20dd8SAlexander Potapenko * redzone. 4820ce20dd8SAlexander Potapenko */ 4830ce20dd8SAlexander Potapenko if (unlikely(slab_want_init_on_alloc(gfp, cache))) 4840ce20dd8SAlexander Potapenko memzero_explicit(addr, size); 4850ce20dd8SAlexander Potapenko if (cache->ctor) 4860ce20dd8SAlexander Potapenko cache->ctor(addr); 4870ce20dd8SAlexander Potapenko 488327b18b7SJason A. Donenfeld if (random_fault) 4890ce20dd8SAlexander Potapenko kfence_protect(meta->addr); /* Random "faults" by protecting the object. 
*/ 4900ce20dd8SAlexander Potapenko 4910ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); 4920ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); 4930ce20dd8SAlexander Potapenko 4940ce20dd8SAlexander Potapenko return addr; 4950ce20dd8SAlexander Potapenko } 4960ce20dd8SAlexander Potapenko 4970ce20dd8SAlexander Potapenko static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) 4980ce20dd8SAlexander Potapenko { 4990ce20dd8SAlexander Potapenko struct kcsan_scoped_access assert_page_exclusive; 5000ce20dd8SAlexander Potapenko unsigned long flags; 50149332956SMarco Elver bool init; 5020ce20dd8SAlexander Potapenko 5030ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&meta->lock, flags); 5040ce20dd8SAlexander Potapenko 5050ce20dd8SAlexander Potapenko if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { 5060ce20dd8SAlexander Potapenko /* Invalid or double-free, bail out. */ 5070ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); 508bc8fbc5fSMarco Elver kfence_report_error((unsigned long)addr, false, NULL, meta, 509bc8fbc5fSMarco Elver KFENCE_ERROR_INVALID_FREE); 5100ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&meta->lock, flags); 5110ce20dd8SAlexander Potapenko return; 5120ce20dd8SAlexander Potapenko } 5130ce20dd8SAlexander Potapenko 5140ce20dd8SAlexander Potapenko /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ 5150ce20dd8SAlexander Potapenko kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, 5160ce20dd8SAlexander Potapenko KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, 5170ce20dd8SAlexander Potapenko &assert_page_exclusive); 5180ce20dd8SAlexander Potapenko 5190ce20dd8SAlexander Potapenko if (CONFIG_KFENCE_STRESS_TEST_FAULTS) 5200ce20dd8SAlexander Potapenko kfence_unprotect((unsigned long)addr); /* To check canary bytes. 
*/ 5210ce20dd8SAlexander Potapenko 5220ce20dd8SAlexander Potapenko /* Restore page protection if there was an OOB access. */ 5230ce20dd8SAlexander Potapenko if (meta->unprotected_page) { 52494868a1eSMarco Elver memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); 5250ce20dd8SAlexander Potapenko kfence_protect(meta->unprotected_page); 5260ce20dd8SAlexander Potapenko meta->unprotected_page = 0; 5270ce20dd8SAlexander Potapenko } 5280ce20dd8SAlexander Potapenko 52949332956SMarco Elver /* Mark the object as freed. */ 53049332956SMarco Elver metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); 53149332956SMarco Elver init = slab_want_init_on_free(meta->cache); 53249332956SMarco Elver raw_spin_unlock_irqrestore(&meta->lock, flags); 53349332956SMarco Elver 53449332956SMarco Elver alloc_covered_add(meta->alloc_stack_hash, -1); 53549332956SMarco Elver 5360ce20dd8SAlexander Potapenko /* Check canary bytes for memory corruption. */ 5371ba3cbf3SPeng Zhang check_canary(meta); 5380ce20dd8SAlexander Potapenko 5390ce20dd8SAlexander Potapenko /* 5400ce20dd8SAlexander Potapenko * Clear memory if init-on-free is set. While we protect the page, the 5410ce20dd8SAlexander Potapenko * data is still there, and after a use-after-free is detected, we 5420ce20dd8SAlexander Potapenko * unprotect the page, so the data is still accessible. 5430ce20dd8SAlexander Potapenko */ 54449332956SMarco Elver if (!zombie && unlikely(init)) 5450ce20dd8SAlexander Potapenko memzero_explicit(addr, meta->size); 5460ce20dd8SAlexander Potapenko 5470ce20dd8SAlexander Potapenko /* Protect to detect use-after-frees. */ 5480ce20dd8SAlexander Potapenko kfence_protect((unsigned long)addr); 5490ce20dd8SAlexander Potapenko 5500ce20dd8SAlexander Potapenko kcsan_end_scoped_access(&assert_page_exclusive); 5510ce20dd8SAlexander Potapenko if (!zombie) { 5520ce20dd8SAlexander Potapenko /* Add it to the tail of the freelist for reuse. 
*/ 5530ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&kfence_freelist_lock, flags); 5540ce20dd8SAlexander Potapenko KFENCE_WARN_ON(!list_empty(&meta->list)); 5550ce20dd8SAlexander Potapenko list_add_tail(&meta->list, &kfence_freelist); 5560ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); 5570ce20dd8SAlexander Potapenko 5580ce20dd8SAlexander Potapenko atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); 5590ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); 5600ce20dd8SAlexander Potapenko } else { 5610ce20dd8SAlexander Potapenko /* See kfence_shutdown_cache(). */ 5620ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); 5630ce20dd8SAlexander Potapenko } 5640ce20dd8SAlexander Potapenko } 5650ce20dd8SAlexander Potapenko 5660ce20dd8SAlexander Potapenko static void rcu_guarded_free(struct rcu_head *h) 5670ce20dd8SAlexander Potapenko { 5680ce20dd8SAlexander Potapenko struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); 5690ce20dd8SAlexander Potapenko 5700ce20dd8SAlexander Potapenko kfence_guarded_free((void *)meta->addr, meta, false); 5710ce20dd8SAlexander Potapenko } 5720ce20dd8SAlexander Potapenko 573b33f778bSTianchen Ding /* 574b33f778bSTianchen Ding * Initialization of the KFENCE pool after its allocation. 575b33f778bSTianchen Ding * Returns 0 on success; otherwise returns the address up to 576b33f778bSTianchen Ding * which partial initialization succeeded. 
577b33f778bSTianchen Ding */ 578b33f778bSTianchen Ding static unsigned long kfence_init_pool(void) 5790ce20dd8SAlexander Potapenko { 580ec9fee79SEnze Li unsigned long addr; 5810ce20dd8SAlexander Potapenko struct page *pages; 5820ce20dd8SAlexander Potapenko int i; 5830ce20dd8SAlexander Potapenko 5840ce20dd8SAlexander Potapenko if (!arch_kfence_init_pool()) 585ec9fee79SEnze Li return (unsigned long)__kfence_pool; 5860ce20dd8SAlexander Potapenko 587ec9fee79SEnze Li addr = (unsigned long)__kfence_pool; 5889e7ee421SLinus Walleij pages = virt_to_page(__kfence_pool); 5890ce20dd8SAlexander Potapenko 5900ce20dd8SAlexander Potapenko /* 5910ce20dd8SAlexander Potapenko * Set up object pages: they must have PG_slab set, to avoid freeing 5920ce20dd8SAlexander Potapenko * these as real pages. 5930ce20dd8SAlexander Potapenko * 5940ce20dd8SAlexander Potapenko * We also want to avoid inserting kfence_free() in the kfree() 5950ce20dd8SAlexander Potapenko * fast-path in SLUB, and therefore need to ensure kfree() correctly 5960ce20dd8SAlexander Potapenko * enters __slab_free() slow-path. 5970ce20dd8SAlexander Potapenko */ 5980ce20dd8SAlexander Potapenko for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { 5991f2803b2SMuchun Song struct slab *slab = page_slab(nth_page(pages, i)); 6008f0b3649SMuchun Song 6010ce20dd8SAlexander Potapenko if (!i || (i % 2)) 6020ce20dd8SAlexander Potapenko continue; 6030ce20dd8SAlexander Potapenko 6048f0b3649SMuchun Song __folio_set_slab(slab_folio(slab)); 605*3a3b7fecSJohannes Weiner #ifdef CONFIG_MEMCG 60621c690a3SSuren Baghdasaryan slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | 60721c690a3SSuren Baghdasaryan MEMCG_DATA_OBJEXTS; 6088f0b3649SMuchun Song #endif 6090ce20dd8SAlexander Potapenko } 6100ce20dd8SAlexander Potapenko 6110ce20dd8SAlexander Potapenko /* 6120ce20dd8SAlexander Potapenko * Protect the first 2 pages. 
The first page is mostly unnecessary, and 6130ce20dd8SAlexander Potapenko * merely serves as an extended guard page. However, adding one 6140ce20dd8SAlexander Potapenko * additional page in the beginning gives us an even number of pages, 6150ce20dd8SAlexander Potapenko * which simplifies the mapping of address to metadata index. 6160ce20dd8SAlexander Potapenko */ 6170ce20dd8SAlexander Potapenko for (i = 0; i < 2; i++) { 6180ce20dd8SAlexander Potapenko if (unlikely(!kfence_protect(addr))) 619b33f778bSTianchen Ding return addr; 6200ce20dd8SAlexander Potapenko 6210ce20dd8SAlexander Potapenko addr += PAGE_SIZE; 6220ce20dd8SAlexander Potapenko } 6230ce20dd8SAlexander Potapenko 6240ce20dd8SAlexander Potapenko for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { 625cabdf74eSPeng Zhang struct kfence_metadata *meta = &kfence_metadata_init[i]; 6260ce20dd8SAlexander Potapenko 6270ce20dd8SAlexander Potapenko /* Initialize metadata. */ 6280ce20dd8SAlexander Potapenko INIT_LIST_HEAD(&meta->list); 6290ce20dd8SAlexander Potapenko raw_spin_lock_init(&meta->lock); 6300ce20dd8SAlexander Potapenko meta->state = KFENCE_OBJECT_UNUSED; 6310ce20dd8SAlexander Potapenko meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ 6320ce20dd8SAlexander Potapenko list_add_tail(&meta->list, &kfence_freelist); 6330ce20dd8SAlexander Potapenko 6340ce20dd8SAlexander Potapenko /* Protect the right redzone. */ 6350ce20dd8SAlexander Potapenko if (unlikely(!kfence_protect(addr + PAGE_SIZE))) 6363ee2d747SMuchun Song goto reset_slab; 6370ce20dd8SAlexander Potapenko 6380ce20dd8SAlexander Potapenko addr += 2 * PAGE_SIZE; 6390ce20dd8SAlexander Potapenko } 6400ce20dd8SAlexander Potapenko 641cabdf74eSPeng Zhang /* 642cabdf74eSPeng Zhang * Make kfence_metadata visible only when initialization is successful. 643cabdf74eSPeng Zhang * Otherwise, if the initialization fails and kfence_metadata is freed, 644cabdf74eSPeng Zhang * it may cause UAF in kfence_shutdown_cache(). 
645cabdf74eSPeng Zhang */ 646cabdf74eSPeng Zhang smp_store_release(&kfence_metadata, kfence_metadata_init); 647b33f778bSTianchen Ding return 0; 6483ee2d747SMuchun Song 6493ee2d747SMuchun Song reset_slab: 6503ee2d747SMuchun Song for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { 6511f2803b2SMuchun Song struct slab *slab = page_slab(nth_page(pages, i)); 6523ee2d747SMuchun Song 6533ee2d747SMuchun Song if (!i || (i % 2)) 6543ee2d747SMuchun Song continue; 655*3a3b7fecSJohannes Weiner #ifdef CONFIG_MEMCG 65621c690a3SSuren Baghdasaryan slab->obj_exts = 0; 6573ee2d747SMuchun Song #endif 6583ee2d747SMuchun Song __folio_clear_slab(slab_folio(slab)); 6593ee2d747SMuchun Song } 6603ee2d747SMuchun Song 6613ee2d747SMuchun Song return addr; 662b33f778bSTianchen Ding } 663b33f778bSTianchen Ding 664b33f778bSTianchen Ding static bool __init kfence_init_pool_early(void) 665b33f778bSTianchen Ding { 666b33f778bSTianchen Ding unsigned long addr; 667b33f778bSTianchen Ding 668b33f778bSTianchen Ding if (!__kfence_pool) 669b33f778bSTianchen Ding return false; 670b33f778bSTianchen Ding 671b33f778bSTianchen Ding addr = kfence_init_pool(); 672b33f778bSTianchen Ding 67307313a2bSYee Lee if (!addr) { 67407313a2bSYee Lee /* 67507313a2bSYee Lee * The pool is live and will never be deallocated from this point on. 67607313a2bSYee Lee * Ignore the pool object from the kmemleak phys object tree, as it would 67707313a2bSYee Lee * otherwise overlap with allocations returned by kfence_alloc(), which 67807313a2bSYee Lee * are registered with kmemleak through the slab post-alloc hook. 67907313a2bSYee Lee */ 68007313a2bSYee Lee kmemleak_ignore_phys(__pa(__kfence_pool)); 6810ce20dd8SAlexander Potapenko return true; 68207313a2bSYee Lee } 6830ce20dd8SAlexander Potapenko 6840ce20dd8SAlexander Potapenko /* 6850ce20dd8SAlexander Potapenko * Only release unprotected pages, and do not try to go back and change 6860ce20dd8SAlexander Potapenko * page attributes due to risk of failing to do so as well. 
If changing 6870ce20dd8SAlexander Potapenko * page attributes for some pages fails, it is very likely that it also 6880ce20dd8SAlexander Potapenko * fails for the first page, and therefore expect addr==__kfence_pool in 6890ce20dd8SAlexander Potapenko * most failure cases. 6900ce20dd8SAlexander Potapenko */ 6910ce20dd8SAlexander Potapenko memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); 6920ce20dd8SAlexander Potapenko __kfence_pool = NULL; 6930ce20dd8SAlexander Potapenko 694cabdf74eSPeng Zhang memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); 695cabdf74eSPeng Zhang kfence_metadata_init = NULL; 696b33f778bSTianchen Ding 697b33f778bSTianchen Ding return false; 698b33f778bSTianchen Ding } 699b33f778bSTianchen Ding 7000ce20dd8SAlexander Potapenko /* === DebugFS Interface ==================================================== */ 7010ce20dd8SAlexander Potapenko 7020ce20dd8SAlexander Potapenko static int stats_show(struct seq_file *seq, void *v) 7030ce20dd8SAlexander Potapenko { 7040ce20dd8SAlexander Potapenko int i; 7050ce20dd8SAlexander Potapenko 7060ce20dd8SAlexander Potapenko seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); 7070ce20dd8SAlexander Potapenko for (i = 0; i < KFENCE_COUNTER_COUNT; i++) 7080ce20dd8SAlexander Potapenko seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); 7090ce20dd8SAlexander Potapenko 7100ce20dd8SAlexander Potapenko return 0; 7110ce20dd8SAlexander Potapenko } 7120ce20dd8SAlexander Potapenko DEFINE_SHOW_ATTRIBUTE(stats); 7130ce20dd8SAlexander Potapenko 7140ce20dd8SAlexander Potapenko /* 7150ce20dd8SAlexander Potapenko * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. 7160ce20dd8SAlexander Potapenko * start_object() and next_object() return the object index + 1, because NULL is used 7170ce20dd8SAlexander Potapenko * to stop iteration. 
7180ce20dd8SAlexander Potapenko */ 7190ce20dd8SAlexander Potapenko static void *start_object(struct seq_file *seq, loff_t *pos) 7200ce20dd8SAlexander Potapenko { 7210ce20dd8SAlexander Potapenko if (*pos < CONFIG_KFENCE_NUM_OBJECTS) 7220ce20dd8SAlexander Potapenko return (void *)((long)*pos + 1); 7230ce20dd8SAlexander Potapenko return NULL; 7240ce20dd8SAlexander Potapenko } 7250ce20dd8SAlexander Potapenko 7260ce20dd8SAlexander Potapenko static void stop_object(struct seq_file *seq, void *v) 7270ce20dd8SAlexander Potapenko { 7280ce20dd8SAlexander Potapenko } 7290ce20dd8SAlexander Potapenko 7300ce20dd8SAlexander Potapenko static void *next_object(struct seq_file *seq, void *v, loff_t *pos) 7310ce20dd8SAlexander Potapenko { 7320ce20dd8SAlexander Potapenko ++*pos; 7330ce20dd8SAlexander Potapenko if (*pos < CONFIG_KFENCE_NUM_OBJECTS) 7340ce20dd8SAlexander Potapenko return (void *)((long)*pos + 1); 7350ce20dd8SAlexander Potapenko return NULL; 7360ce20dd8SAlexander Potapenko } 7370ce20dd8SAlexander Potapenko 7380ce20dd8SAlexander Potapenko static int show_object(struct seq_file *seq, void *v) 7390ce20dd8SAlexander Potapenko { 7400ce20dd8SAlexander Potapenko struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; 7410ce20dd8SAlexander Potapenko unsigned long flags; 7420ce20dd8SAlexander Potapenko 7430ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&meta->lock, flags); 7440ce20dd8SAlexander Potapenko kfence_print_object(seq, meta); 7450ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&meta->lock, flags); 7460ce20dd8SAlexander Potapenko seq_puts(seq, "---------------------------------\n"); 7470ce20dd8SAlexander Potapenko 7480ce20dd8SAlexander Potapenko return 0; 7490ce20dd8SAlexander Potapenko } 7500ce20dd8SAlexander Potapenko 7516b1964e6SLiu Shixin static const struct seq_operations objects_sops = { 7520ce20dd8SAlexander Potapenko .start = start_object, 7530ce20dd8SAlexander Potapenko .next = next_object, 7540ce20dd8SAlexander Potapenko .stop = stop_object, 
7550ce20dd8SAlexander Potapenko .show = show_object, 7560ce20dd8SAlexander Potapenko }; 7576b1964e6SLiu Shixin DEFINE_SEQ_ATTRIBUTE(objects); 7580ce20dd8SAlexander Potapenko 7591c86a188SMuchun Song static int kfence_debugfs_init(void) 7600ce20dd8SAlexander Potapenko { 7611c86a188SMuchun Song struct dentry *kfence_dir; 7620ce20dd8SAlexander Potapenko 7631c86a188SMuchun Song if (!READ_ONCE(kfence_enabled)) 7641c86a188SMuchun Song return 0; 7651c86a188SMuchun Song 7661c86a188SMuchun Song kfence_dir = debugfs_create_dir("kfence", NULL); 7670ce20dd8SAlexander Potapenko debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); 7680ce20dd8SAlexander Potapenko debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); 7690ce20dd8SAlexander Potapenko return 0; 7700ce20dd8SAlexander Potapenko } 7710ce20dd8SAlexander Potapenko 7720ce20dd8SAlexander Potapenko late_initcall(kfence_debugfs_init); 7730ce20dd8SAlexander Potapenko 7743c81b3bbShuangshaobo /* === Panic Notifier ====================================================== */ 7753c81b3bbShuangshaobo 7763c81b3bbShuangshaobo static void kfence_check_all_canary(void) 7773c81b3bbShuangshaobo { 7783c81b3bbShuangshaobo int i; 7793c81b3bbShuangshaobo 7803c81b3bbShuangshaobo for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { 7813c81b3bbShuangshaobo struct kfence_metadata *meta = &kfence_metadata[i]; 7823c81b3bbShuangshaobo 7833c81b3bbShuangshaobo if (meta->state == KFENCE_OBJECT_ALLOCATED) 7841ba3cbf3SPeng Zhang check_canary(meta); 7853c81b3bbShuangshaobo } 7863c81b3bbShuangshaobo } 7873c81b3bbShuangshaobo 7883c81b3bbShuangshaobo static int kfence_check_canary_callback(struct notifier_block *nb, 7893c81b3bbShuangshaobo unsigned long reason, void *arg) 7903c81b3bbShuangshaobo { 7913c81b3bbShuangshaobo kfence_check_all_canary(); 7923c81b3bbShuangshaobo return NOTIFY_OK; 7933c81b3bbShuangshaobo } 7943c81b3bbShuangshaobo 7953c81b3bbShuangshaobo static struct notifier_block kfence_check_canary_notifier = { 
7963c81b3bbShuangshaobo .notifier_call = kfence_check_canary_callback, 7973c81b3bbShuangshaobo }; 7983c81b3bbShuangshaobo 7990ce20dd8SAlexander Potapenko /* === Allocation Gate Timer ================================================ */ 8000ce20dd8SAlexander Potapenko 801737b6a10SMarco Elver static struct delayed_work kfence_timer; 802737b6a10SMarco Elver 803407f1d8cSMarco Elver #ifdef CONFIG_KFENCE_STATIC_KEYS 804407f1d8cSMarco Elver /* Wait queue to wake up allocation-gate timer task. */ 805407f1d8cSMarco Elver static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); 806407f1d8cSMarco Elver 807407f1d8cSMarco Elver static void wake_up_kfence_timer(struct irq_work *work) 808407f1d8cSMarco Elver { 809407f1d8cSMarco Elver wake_up(&allocation_wait); 810407f1d8cSMarco Elver } 811407f1d8cSMarco Elver static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); 812407f1d8cSMarco Elver #endif 813407f1d8cSMarco Elver 8140ce20dd8SAlexander Potapenko /* 8150ce20dd8SAlexander Potapenko * Set up delayed work, which will enable and disable the static key. We need to 8160ce20dd8SAlexander Potapenko * use a work queue (rather than a simple timer), since enabling and disabling a 8170ce20dd8SAlexander Potapenko * static key cannot be done from an interrupt. 8180ce20dd8SAlexander Potapenko * 8190ce20dd8SAlexander Potapenko * Note: Toggling a static branch currently causes IPIs, and here we'll end up 8200ce20dd8SAlexander Potapenko * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with 8210ce20dd8SAlexander Potapenko * more aggressive sampling intervals), we could get away with a variant that 8220ce20dd8SAlexander Potapenko * avoids IPIs, at the cost of not immediately capturing allocations if the 8230ce20dd8SAlexander Potapenko * instructions remain cached. 
8240ce20dd8SAlexander Potapenko */ 8250ce20dd8SAlexander Potapenko static void toggle_allocation_gate(struct work_struct *work) 8260ce20dd8SAlexander Potapenko { 8270ce20dd8SAlexander Potapenko if (!READ_ONCE(kfence_enabled)) 8280ce20dd8SAlexander Potapenko return; 8290ce20dd8SAlexander Potapenko 8300ce20dd8SAlexander Potapenko atomic_set(&kfence_allocation_gate, 0); 8310ce20dd8SAlexander Potapenko #ifdef CONFIG_KFENCE_STATIC_KEYS 832407f1d8cSMarco Elver /* Enable static key, and await allocation to happen. */ 8330ce20dd8SAlexander Potapenko static_branch_enable(&kfence_allocation_key); 8340ce20dd8SAlexander Potapenko 8358fd0e995SMarco Elver wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); 836407f1d8cSMarco Elver 8370ce20dd8SAlexander Potapenko /* Disable static key and reset timer. */ 8380ce20dd8SAlexander Potapenko static_branch_disable(&kfence_allocation_key); 8390ce20dd8SAlexander Potapenko #endif 840ff06e45dSMarco Elver queue_delayed_work(system_unbound_wq, &kfence_timer, 84136f0b35dSMarco Elver msecs_to_jiffies(kfence_sample_interval)); 8420ce20dd8SAlexander Potapenko } 8430ce20dd8SAlexander Potapenko 8440ce20dd8SAlexander Potapenko /* === Public interface ===================================================== */ 8450ce20dd8SAlexander Potapenko 846cabdf74eSPeng Zhang void __init kfence_alloc_pool_and_metadata(void) 8470ce20dd8SAlexander Potapenko { 8480ce20dd8SAlexander Potapenko if (!kfence_sample_interval) 8490ce20dd8SAlexander Potapenko return; 8500ce20dd8SAlexander Potapenko 851cabdf74eSPeng Zhang /* 852cabdf74eSPeng Zhang * If the pool has already been initialized by arch, there is no need to 853cabdf74eSPeng Zhang * re-allocate the memory pool. 
854cabdf74eSPeng Zhang */ 855cabdf74eSPeng Zhang if (!__kfence_pool) 8560ce20dd8SAlexander Potapenko __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); 8570ce20dd8SAlexander Potapenko 858cabdf74eSPeng Zhang if (!__kfence_pool) { 8590ce20dd8SAlexander Potapenko pr_err("failed to allocate pool\n"); 860cabdf74eSPeng Zhang return; 861cabdf74eSPeng Zhang } 862cabdf74eSPeng Zhang 863cabdf74eSPeng Zhang /* The memory allocated by memblock has been zeroed out. */ 864cabdf74eSPeng Zhang kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE); 865cabdf74eSPeng Zhang if (!kfence_metadata_init) { 866cabdf74eSPeng Zhang pr_err("failed to allocate metadata\n"); 867cabdf74eSPeng Zhang memblock_free(__kfence_pool, KFENCE_POOL_SIZE); 868cabdf74eSPeng Zhang __kfence_pool = NULL; 869cabdf74eSPeng Zhang } 8700ce20dd8SAlexander Potapenko } 8710ce20dd8SAlexander Potapenko 872b33f778bSTianchen Ding static void kfence_init_enable(void) 8730ce20dd8SAlexander Potapenko { 87407e8481dSMarco Elver if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) 87507e8481dSMarco Elver static_branch_enable(&kfence_allocation_key); 876737b6a10SMarco Elver 877737b6a10SMarco Elver if (kfence_deferrable) 878737b6a10SMarco Elver INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate); 879737b6a10SMarco Elver else 880737b6a10SMarco Elver INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); 881737b6a10SMarco Elver 8823c81b3bbShuangshaobo if (kfence_check_on_panic) 8833c81b3bbShuangshaobo atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); 8843c81b3bbShuangshaobo 8850ce20dd8SAlexander Potapenko WRITE_ONCE(kfence_enabled, true); 886ff06e45dSMarco Elver queue_delayed_work(system_unbound_wq, &kfence_timer, 0); 887737b6a10SMarco Elver 88835beccf0SMarco Elver pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, 88935beccf0SMarco Elver CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, 8900ce20dd8SAlexander Potapenko (void 
*)(__kfence_pool + KFENCE_POOL_SIZE)); 8910ce20dd8SAlexander Potapenko } 8920ce20dd8SAlexander Potapenko 893b33f778bSTianchen Ding void __init kfence_init(void) 894b33f778bSTianchen Ding { 89508475dabSJason A. Donenfeld stack_hash_seed = get_random_u32(); 896b33f778bSTianchen Ding 897b33f778bSTianchen Ding /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ 898b33f778bSTianchen Ding if (!kfence_sample_interval) 899b33f778bSTianchen Ding return; 900b33f778bSTianchen Ding 901b33f778bSTianchen Ding if (!kfence_init_pool_early()) { 902b33f778bSTianchen Ding pr_err("%s failed\n", __func__); 903b33f778bSTianchen Ding return; 904b33f778bSTianchen Ding } 905b33f778bSTianchen Ding 906b33f778bSTianchen Ding kfence_init_enable(); 907b33f778bSTianchen Ding } 908b33f778bSTianchen Ding 909b33f778bSTianchen Ding static int kfence_init_late(void) 910b33f778bSTianchen Ding { 911cabdf74eSPeng Zhang const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE; 912cabdf74eSPeng Zhang const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE; 913cabdf74eSPeng Zhang unsigned long addr = (unsigned long)__kfence_pool; 914cabdf74eSPeng Zhang unsigned long free_size = KFENCE_POOL_SIZE; 915cabdf74eSPeng Zhang int err = -ENOMEM; 916cabdf74eSPeng Zhang 917b33f778bSTianchen Ding #ifdef CONFIG_CONTIG_ALLOC 918b33f778bSTianchen Ding struct page *pages; 919b33f778bSTianchen Ding 920cabdf74eSPeng Zhang pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, 921cabdf74eSPeng Zhang NULL); 922b33f778bSTianchen Ding if (!pages) 923b33f778bSTianchen Ding return -ENOMEM; 924cabdf74eSPeng Zhang 925b33f778bSTianchen Ding __kfence_pool = page_to_virt(pages); 926cabdf74eSPeng Zhang pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, 927cabdf74eSPeng Zhang NULL); 928cabdf74eSPeng Zhang if (pages) 929cabdf74eSPeng Zhang kfence_metadata_init = page_to_virt(pages); 930b33f778bSTianchen Ding #else 931cabdf74eSPeng Zhang if (nr_pages_pool > 
MAX_ORDER_NR_PAGES || 932cabdf74eSPeng Zhang nr_pages_meta > MAX_ORDER_NR_PAGES) { 933b33f778bSTianchen Ding pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); 934b33f778bSTianchen Ding return -EINVAL; 935b33f778bSTianchen Ding } 936cabdf74eSPeng Zhang 937b33f778bSTianchen Ding __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); 938b33f778bSTianchen Ding if (!__kfence_pool) 939b33f778bSTianchen Ding return -ENOMEM; 940cabdf74eSPeng Zhang 941cabdf74eSPeng Zhang kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); 942b33f778bSTianchen Ding #endif 943b33f778bSTianchen Ding 944cabdf74eSPeng Zhang if (!kfence_metadata_init) 945cabdf74eSPeng Zhang goto free_pool; 946b33f778bSTianchen Ding 947cabdf74eSPeng Zhang memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE); 948cabdf74eSPeng Zhang addr = kfence_init_pool(); 949cabdf74eSPeng Zhang if (!addr) { 950b33f778bSTianchen Ding kfence_init_enable(); 9511c86a188SMuchun Song kfence_debugfs_init(); 952b33f778bSTianchen Ding return 0; 953b33f778bSTianchen Ding } 954b33f778bSTianchen Ding 955cabdf74eSPeng Zhang pr_err("%s failed\n", __func__); 956cabdf74eSPeng Zhang free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); 957cabdf74eSPeng Zhang err = -EBUSY; 958cabdf74eSPeng Zhang 959cabdf74eSPeng Zhang #ifdef CONFIG_CONTIG_ALLOC 960cabdf74eSPeng Zhang free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)), 961cabdf74eSPeng Zhang nr_pages_meta); 962cabdf74eSPeng Zhang free_pool: 963cabdf74eSPeng Zhang free_contig_range(page_to_pfn(virt_to_page((void *)addr)), 964cabdf74eSPeng Zhang free_size / PAGE_SIZE); 965cabdf74eSPeng Zhang #else 966cabdf74eSPeng Zhang free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE); 967cabdf74eSPeng Zhang free_pool: 968cabdf74eSPeng Zhang free_pages_exact((void *)addr, free_size); 969cabdf74eSPeng Zhang #endif 970cabdf74eSPeng Zhang 971cabdf74eSPeng Zhang kfence_metadata_init = NULL; 972cabdf74eSPeng 
Zhang __kfence_pool = NULL; 973cabdf74eSPeng Zhang return err; 974cabdf74eSPeng Zhang } 975cabdf74eSPeng Zhang 976698361bcSTianchen Ding static int kfence_enable_late(void) 977698361bcSTianchen Ding { 978698361bcSTianchen Ding if (!__kfence_pool) 979b33f778bSTianchen Ding return kfence_init_late(); 980698361bcSTianchen Ding 981698361bcSTianchen Ding WRITE_ONCE(kfence_enabled, true); 982698361bcSTianchen Ding queue_delayed_work(system_unbound_wq, &kfence_timer, 0); 98383d7d04fSJackie Liu pr_info("re-enabled\n"); 984698361bcSTianchen Ding return 0; 985698361bcSTianchen Ding } 986698361bcSTianchen Ding 9870ce20dd8SAlexander Potapenko void kfence_shutdown_cache(struct kmem_cache *s) 9880ce20dd8SAlexander Potapenko { 9890ce20dd8SAlexander Potapenko unsigned long flags; 9900ce20dd8SAlexander Potapenko struct kfence_metadata *meta; 9910ce20dd8SAlexander Potapenko int i; 9920ce20dd8SAlexander Potapenko 993cabdf74eSPeng Zhang /* Pairs with release in kfence_init_pool(). */ 994cabdf74eSPeng Zhang if (!smp_load_acquire(&kfence_metadata)) 995cabdf74eSPeng Zhang return; 996cabdf74eSPeng Zhang 9970ce20dd8SAlexander Potapenko for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { 9980ce20dd8SAlexander Potapenko bool in_use; 9990ce20dd8SAlexander Potapenko 10000ce20dd8SAlexander Potapenko meta = &kfence_metadata[i]; 10010ce20dd8SAlexander Potapenko 10020ce20dd8SAlexander Potapenko /* 10030ce20dd8SAlexander Potapenko * If we observe some inconsistent cache and state pair where we 10040ce20dd8SAlexander Potapenko * should have returned false here, cache destruction is racing 10050ce20dd8SAlexander Potapenko * with either kmem_cache_alloc() or kmem_cache_free(). Taking 10060ce20dd8SAlexander Potapenko * the lock will not help, as different critical section 10070ce20dd8SAlexander Potapenko * serialization will have the same outcome. 
10080ce20dd8SAlexander Potapenko */ 10090ce20dd8SAlexander Potapenko if (READ_ONCE(meta->cache) != s || 10100ce20dd8SAlexander Potapenko READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) 10110ce20dd8SAlexander Potapenko continue; 10120ce20dd8SAlexander Potapenko 10130ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&meta->lock, flags); 10140ce20dd8SAlexander Potapenko in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; 10150ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&meta->lock, flags); 10160ce20dd8SAlexander Potapenko 10170ce20dd8SAlexander Potapenko if (in_use) { 10180ce20dd8SAlexander Potapenko /* 10190ce20dd8SAlexander Potapenko * This cache still has allocations, and we should not 10200ce20dd8SAlexander Potapenko * release them back into the freelist so they can still 10210ce20dd8SAlexander Potapenko * safely be used and retain the kernel's default 10220ce20dd8SAlexander Potapenko * behaviour of keeping the allocations alive (leak the 10230ce20dd8SAlexander Potapenko * cache); however, they effectively become "zombie 10240ce20dd8SAlexander Potapenko * allocations" as the KFENCE objects are the only ones 10250ce20dd8SAlexander Potapenko * still in use and the owning cache is being destroyed. 10260ce20dd8SAlexander Potapenko * 10270ce20dd8SAlexander Potapenko * We mark them freed, so that any subsequent use shows 10280ce20dd8SAlexander Potapenko * more useful error messages that will include stack 10290ce20dd8SAlexander Potapenko * traces of the user of the object, the original 10300ce20dd8SAlexander Potapenko * allocation, and caller to shutdown_cache(). 
10310ce20dd8SAlexander Potapenko */ 10320ce20dd8SAlexander Potapenko kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); 10330ce20dd8SAlexander Potapenko } 10340ce20dd8SAlexander Potapenko } 10350ce20dd8SAlexander Potapenko 10360ce20dd8SAlexander Potapenko for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { 10370ce20dd8SAlexander Potapenko meta = &kfence_metadata[i]; 10380ce20dd8SAlexander Potapenko 10390ce20dd8SAlexander Potapenko /* See above. */ 10400ce20dd8SAlexander Potapenko if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) 10410ce20dd8SAlexander Potapenko continue; 10420ce20dd8SAlexander Potapenko 10430ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&meta->lock, flags); 10440ce20dd8SAlexander Potapenko if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) 10450ce20dd8SAlexander Potapenko meta->cache = NULL; 10460ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&meta->lock, flags); 10470ce20dd8SAlexander Potapenko } 10480ce20dd8SAlexander Potapenko } 10490ce20dd8SAlexander Potapenko 10500ce20dd8SAlexander Potapenko void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) 10510ce20dd8SAlexander Potapenko { 1052a9ab52bbSMarco Elver unsigned long stack_entries[KFENCE_STACK_DEPTH]; 1053a9ab52bbSMarco Elver size_t num_stack_entries; 105408f6b106SMarco Elver u32 alloc_stack_hash; 1055a9ab52bbSMarco Elver 10560ce20dd8SAlexander Potapenko /* 1057235a85cbSAlexander Potapenko * Perform size check before switching kfence_allocation_gate, so that 1058235a85cbSAlexander Potapenko * we don't disable KFENCE without making an allocation. 
1059235a85cbSAlexander Potapenko */ 10609a19aeb5SMarco Elver if (size > PAGE_SIZE) { 10619a19aeb5SMarco Elver atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); 1062235a85cbSAlexander Potapenko return NULL; 10639a19aeb5SMarco Elver } 1064235a85cbSAlexander Potapenko 1065235a85cbSAlexander Potapenko /* 1066236e9f15SAlexander Potapenko * Skip allocations from non-default zones, including DMA. We cannot 1067236e9f15SAlexander Potapenko * guarantee that pages in the KFENCE pool will have the requested 1068236e9f15SAlexander Potapenko * properties (e.g. reside in DMAable memory). 1069236e9f15SAlexander Potapenko */ 1070236e9f15SAlexander Potapenko if ((flags & GFP_ZONEMASK) || 10719a19aeb5SMarco Elver (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { 10729a19aeb5SMarco Elver atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); 1073236e9f15SAlexander Potapenko return NULL; 10749a19aeb5SMarco Elver } 1075236e9f15SAlexander Potapenko 1076b84e04f1SImran Khan /* 1077b84e04f1SImran Khan * Skip allocations for this slab, if KFENCE has been disabled for 1078b84e04f1SImran Khan * this slab. 1079b84e04f1SImran Khan */ 1080b84e04f1SImran Khan if (s->flags & SLAB_SKIP_KFENCE) 1081b84e04f1SImran Khan return NULL; 1082b84e04f1SImran Khan 108307e8481dSMarco Elver if (atomic_inc_return(&kfence_allocation_gate) > 1) 10840ce20dd8SAlexander Potapenko return NULL; 1085407f1d8cSMarco Elver #ifdef CONFIG_KFENCE_STATIC_KEYS 1086407f1d8cSMarco Elver /* 1087407f1d8cSMarco Elver * waitqueue_active() is fully ordered after the update of 1088407f1d8cSMarco Elver * kfence_allocation_gate per atomic_inc_return(). 1089407f1d8cSMarco Elver */ 1090407f1d8cSMarco Elver if (waitqueue_active(&allocation_wait)) { 1091407f1d8cSMarco Elver /* 1092407f1d8cSMarco Elver * Calling wake_up() here may deadlock when allocations happen 1093407f1d8cSMarco Elver * from within timer code. Use an irq_work to defer it. 
1094407f1d8cSMarco Elver */ 1095407f1d8cSMarco Elver irq_work_queue(&wake_up_kfence_timer_work); 1096407f1d8cSMarco Elver } 1097407f1d8cSMarco Elver #endif 10980ce20dd8SAlexander Potapenko 10990ce20dd8SAlexander Potapenko if (!READ_ONCE(kfence_enabled)) 11000ce20dd8SAlexander Potapenko return NULL; 11010ce20dd8SAlexander Potapenko 1102a9ab52bbSMarco Elver num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); 1103a9ab52bbSMarco Elver 110408f6b106SMarco Elver /* 110508f6b106SMarco Elver * Do expensive check for coverage of allocation in slow-path after 110608f6b106SMarco Elver * allocation_gate has already become non-zero, even though it might 110708f6b106SMarco Elver * mean not making any allocation within a given sample interval. 110808f6b106SMarco Elver * 110908f6b106SMarco Elver * This ensures reasonable allocation coverage when the pool is almost 111008f6b106SMarco Elver * full, including avoiding long-lived allocations of the same source 111108f6b106SMarco Elver * filling up the pool (e.g. pagecache allocations). 
111208f6b106SMarco Elver */ 111308f6b106SMarco Elver alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); 111408f6b106SMarco Elver if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { 111508f6b106SMarco Elver atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); 111608f6b106SMarco Elver return NULL; 111708f6b106SMarco Elver } 111808f6b106SMarco Elver 111908f6b106SMarco Elver return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, 112008f6b106SMarco Elver alloc_stack_hash); 11210ce20dd8SAlexander Potapenko } 11220ce20dd8SAlexander Potapenko 11230ce20dd8SAlexander Potapenko size_t kfence_ksize(const void *addr) 11240ce20dd8SAlexander Potapenko { 11250ce20dd8SAlexander Potapenko const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); 11260ce20dd8SAlexander Potapenko 11270ce20dd8SAlexander Potapenko /* 11280ce20dd8SAlexander Potapenko * Read locklessly -- if there is a race with __kfence_alloc(), this is 11290ce20dd8SAlexander Potapenko * either a use-after-free or invalid access. 11300ce20dd8SAlexander Potapenko */ 11310ce20dd8SAlexander Potapenko return meta ? meta->size : 0; 11320ce20dd8SAlexander Potapenko } 11330ce20dd8SAlexander Potapenko 11340ce20dd8SAlexander Potapenko void *kfence_object_start(const void *addr) 11350ce20dd8SAlexander Potapenko { 11360ce20dd8SAlexander Potapenko const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); 11370ce20dd8SAlexander Potapenko 11380ce20dd8SAlexander Potapenko /* 11390ce20dd8SAlexander Potapenko * Read locklessly -- if there is a race with __kfence_alloc(), this is 11400ce20dd8SAlexander Potapenko * either a use-after-free or invalid access. 11410ce20dd8SAlexander Potapenko */ 11420ce20dd8SAlexander Potapenko return meta ? 
(void *)meta->addr : NULL; 11430ce20dd8SAlexander Potapenko } 11440ce20dd8SAlexander Potapenko 11450ce20dd8SAlexander Potapenko /* __kfence_free() - release the guarded object at @addr: freed right away through kfence_guarded_free(), or deferred through call_rcu() when the owning cache has SLAB_TYPESAFE_BY_RCU set. NOTE(review): meta is dereferenced without a NULL check -- presumably callers only pass addresses that map to metadata; confirm against the callers. */ void __kfence_free(void *addr) 11460ce20dd8SAlexander Potapenko { 11470ce20dd8SAlexander Potapenko struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); 11480ce20dd8SAlexander Potapenko 1149*3a3b7fecSJohannes Weiner #ifdef CONFIG_MEMCG 115021c690a3SSuren Baghdasaryan KFENCE_WARN_ON(meta->obj_exts.objcg); 11518f0b3649SMuchun Song #endif 11520ce20dd8SAlexander Potapenko /* 11530ce20dd8SAlexander Potapenko * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing 11540ce20dd8SAlexander Potapenko * the object, as the object page may be recycled for other-typed 11550ce20dd8SAlexander Potapenko * objects once it has been freed. meta->cache may be NULL if the cache 11560ce20dd8SAlexander Potapenko * was destroyed. 11570ce20dd8SAlexander Potapenko */ 11580ce20dd8SAlexander Potapenko if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) 11590ce20dd8SAlexander Potapenko call_rcu(&meta->rcu_head, rcu_guarded_free); 11600ce20dd8SAlexander Potapenko else 11610ce20dd8SAlexander Potapenko kfence_guarded_free(addr, meta, false); 11620ce20dd8SAlexander Potapenko } 11630ce20dd8SAlexander Potapenko 1164bc8fbc5fSMarco Elver /* kfence_handle_page_fault() - classify and report a fault on @addr. Returns false when @addr is not a KFENCE address. When KFENCE is disabled the page is only unprotected and nothing is reported. Otherwise KFENCE_COUNTER_BUGS is incremented and the fault is reported as out-of-bounds (odd page index, i.e. a redzone, attributed to an allocated object on the page before or after), as use-after-free (even page index with metadata), or as an invalid access (no metadata found). In every KFENCE case the return value is that of kfence_unprotect(addr). */ bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) 11650ce20dd8SAlexander Potapenko { 11660ce20dd8SAlexander Potapenko const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; 11670ce20dd8SAlexander Potapenko struct kfence_metadata *to_report = NULL; 11680ce20dd8SAlexander Potapenko enum kfence_error_type error_type; 11690ce20dd8SAlexander Potapenko unsigned long flags; 11700ce20dd8SAlexander Potapenko 11710ce20dd8SAlexander Potapenko if (!is_kfence_address((void *)addr)) 11720ce20dd8SAlexander Potapenko return false; 11730ce20dd8SAlexander Potapenko 11740ce20dd8SAlexander Potapenko if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime 
... */ 11750ce20dd8SAlexander Potapenko return kfence_unprotect(addr); /* ... unprotect and proceed. */ 11760ce20dd8SAlexander Potapenko 11770ce20dd8SAlexander Potapenko atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); 11780ce20dd8SAlexander Potapenko 11790ce20dd8SAlexander Potapenko if (page_index % 2) { 11800ce20dd8SAlexander Potapenko /* This is a redzone, report a buffer overflow. */ 11810ce20dd8SAlexander Potapenko struct kfence_metadata *meta; 11820ce20dd8SAlexander Potapenko int distance = 0; 11830ce20dd8SAlexander Potapenko 11840ce20dd8SAlexander Potapenko meta = addr_to_metadata(addr - PAGE_SIZE); 11850ce20dd8SAlexander Potapenko if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { 11860ce20dd8SAlexander Potapenko to_report = meta; 11870ce20dd8SAlexander Potapenko /* Data race ok; distance calculation approximate. */ 11880ce20dd8SAlexander Potapenko distance = addr - data_race(meta->addr + meta->size); 11890ce20dd8SAlexander Potapenko } 11900ce20dd8SAlexander Potapenko 11910ce20dd8SAlexander Potapenko meta = addr_to_metadata(addr + PAGE_SIZE); 11920ce20dd8SAlexander Potapenko if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { 11930ce20dd8SAlexander Potapenko /* Data race ok; distance calculation approximate. 
*/ 11940ce20dd8SAlexander Potapenko if (!to_report || distance > data_race(meta->addr) - addr) 11950ce20dd8SAlexander Potapenko to_report = meta; 11960ce20dd8SAlexander Potapenko } 11970ce20dd8SAlexander Potapenko 11980ce20dd8SAlexander Potapenko if (!to_report) 11990ce20dd8SAlexander Potapenko goto out; 12000ce20dd8SAlexander Potapenko 12010ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&to_report->lock, flags); 12020ce20dd8SAlexander Potapenko to_report->unprotected_page = addr; 12030ce20dd8SAlexander Potapenko error_type = KFENCE_ERROR_OOB; 12040ce20dd8SAlexander Potapenko 12050ce20dd8SAlexander Potapenko /* 12060ce20dd8SAlexander Potapenko * If the object was freed before we took the lock we can still 12070ce20dd8SAlexander Potapenko * report this as an OOB -- the report will simply show the 12080ce20dd8SAlexander Potapenko * stacktrace of the free as well. 12090ce20dd8SAlexander Potapenko */ 12100ce20dd8SAlexander Potapenko } else { 12110ce20dd8SAlexander Potapenko to_report = addr_to_metadata(addr); 12120ce20dd8SAlexander Potapenko if (!to_report) 12130ce20dd8SAlexander Potapenko goto out; 12140ce20dd8SAlexander Potapenko 12150ce20dd8SAlexander Potapenko raw_spin_lock_irqsave(&to_report->lock, flags); 12160ce20dd8SAlexander Potapenko error_type = KFENCE_ERROR_UAF; 12170ce20dd8SAlexander Potapenko /* 12180ce20dd8SAlexander Potapenko * We may race with __kfence_alloc(), and it is possible that a 12190ce20dd8SAlexander Potapenko * freed object may be reallocated. We simply report this as a 12200ce20dd8SAlexander Potapenko * use-after-free, with the stack trace showing the place where 12210ce20dd8SAlexander Potapenko * the object was re-allocated. 
12220ce20dd8SAlexander Potapenko */ 12230ce20dd8SAlexander Potapenko } 12240ce20dd8SAlexander Potapenko 12250ce20dd8SAlexander Potapenko out: 12260ce20dd8SAlexander Potapenko /* A non-NULL to_report only reaches this point with to_report->lock already taken above; the lock is dropped once the report has been emitted. */ if (to_report) { 1227bc8fbc5fSMarco Elver kfence_report_error(addr, is_write, regs, to_report, error_type); 12280ce20dd8SAlexander Potapenko raw_spin_unlock_irqrestore(&to_report->lock, flags); 12290ce20dd8SAlexander Potapenko } else { 12300ce20dd8SAlexander Potapenko /* This may be a UAF or OOB access, but we can't be sure. */ 1231bc8fbc5fSMarco Elver kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); 12320ce20dd8SAlexander Potapenko } 12330ce20dd8SAlexander Potapenko 12340ce20dd8SAlexander Potapenko return kfence_unprotect(addr); /* Unprotect and let access proceed. */ 12350ce20dd8SAlexander Potapenko } 1236