1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright 2024 Rivos Inc. 4 */ 5 6 #include <linux/cpu.h> 7 #include <linux/cpumask.h> 8 #include <linux/jump_label.h> 9 #include <linux/mm.h> 10 #include <linux/smp.h> 11 #include <linux/types.h> 12 #include <asm/cpufeature.h> 13 #include <asm/hwprobe.h> 14 15 #include "copy-unaligned.h" 16 17 #define MISALIGNED_ACCESS_JIFFIES_LG2 1 18 #define MISALIGNED_BUFFER_SIZE 0x4000 19 #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) 20 #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) 21 22 DEFINE_PER_CPU(long, misaligned_access_speed); 23 24 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS 25 static cpumask_t fast_misaligned_access; 26 static int check_unaligned_access(void *param) 27 { 28 int cpu = smp_processor_id(); 29 u64 start_cycles, end_cycles; 30 u64 word_cycles; 31 u64 byte_cycles; 32 int ratio; 33 unsigned long start_jiffies, now; 34 struct page *page = param; 35 void *dst; 36 void *src; 37 long speed = RISCV_HWPROBE_MISALIGNED_SLOW; 38 39 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) 40 return 0; 41 42 /* Make an unaligned destination buffer. */ 43 dst = (void *)((unsigned long)page_address(page) | 0x1); 44 /* Unalign src as well, but differently (off by 1 + 2 = 3). */ 45 src = dst + (MISALIGNED_BUFFER_SIZE / 2); 46 src += 2; 47 word_cycles = -1ULL; 48 /* Do a warmup. */ 49 __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); 50 preempt_disable(); 51 start_jiffies = jiffies; 52 while ((now = jiffies) == start_jiffies) 53 cpu_relax(); 54 55 /* 56 * For a fixed amount of time, repeatedly try the function, and take 57 * the best time in cycles as the measurement. 58 */ 59 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { 60 start_cycles = get_cycles64(); 61 /* Ensure the CSR read can't reorder WRT to the copy. */ 62 mb(); 63 __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); 64 /* Ensure the copy ends before the end time is snapped. */ 65 mb(); 66 end_cycles = get_cycles64(); 67 if ((end_cycles - start_cycles) < word_cycles) 68 word_cycles = end_cycles - start_cycles; 69 } 70 71 byte_cycles = -1ULL; 72 __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); 73 start_jiffies = jiffies; 74 while ((now = jiffies) == start_jiffies) 75 cpu_relax(); 76 77 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { 78 start_cycles = get_cycles64(); 79 mb(); 80 __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); 81 mb(); 82 end_cycles = get_cycles64(); 83 if ((end_cycles - start_cycles) < byte_cycles) 84 byte_cycles = end_cycles - start_cycles; 85 } 86 87 preempt_enable(); 88 89 /* Don't divide by zero. */ 90 if (!word_cycles || !byte_cycles) { 91 pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", 92 cpu); 93 94 return 0; 95 } 96 97 if (word_cycles < byte_cycles) 98 speed = RISCV_HWPROBE_MISALIGNED_FAST; 99 100 ratio = div_u64((byte_cycles * 100), word_cycles); 101 pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", 102 cpu, 103 ratio / 100, 104 ratio % 100, 105 (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); 106 107 per_cpu(misaligned_access_speed, cpu) = speed; 108 109 /* 110 * Set the value of fast_misaligned_access of a CPU. These operations 111 * are atomic to avoid race conditions. 112 */ 113 if (speed == RISCV_HWPROBE_MISALIGNED_FAST) 114 cpumask_set_cpu(cpu, &fast_misaligned_access); 115 else 116 cpumask_clear_cpu(cpu, &fast_misaligned_access); 117 118 return 0; 119 } 120 121 static void check_unaligned_access_nonboot_cpu(void *param) 122 { 123 unsigned int cpu = smp_processor_id(); 124 struct page **pages = param; 125 126 if (smp_processor_id() != 0) 127 check_unaligned_access(pages[cpu]); 128 } 129 130 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); 131 132 static void modify_unaligned_access_branches(cpumask_t *mask, int weight) 133 { 134 if (cpumask_weight(mask) == weight) 135 static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); 136 else 137 static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); 138 } 139 140 static void set_unaligned_access_static_branches_except_cpu(int cpu) 141 { 142 /* 143 * Same as set_unaligned_access_static_branches, except excludes the 144 * given CPU from the result. When a CPU is hotplugged into an offline 145 * state, this function is called before the CPU is set to offline in 146 * the cpumask, and thus the CPU needs to be explicitly excluded. 147 */ 148 149 cpumask_t fast_except_me; 150 151 cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); 152 cpumask_clear_cpu(cpu, &fast_except_me); 153 154 modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); 155 } 156 157 static void set_unaligned_access_static_branches(void) 158 { 159 /* 160 * This will be called after check_unaligned_access_all_cpus so the 161 * result of unaligned access speed for all CPUs will be available. 162 * 163 * To avoid the number of online cpus changing between reading 164 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be 165 * held before calling this function. 166 */ 167 168 cpumask_t fast_and_online; 169 170 cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); 171 172 modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); 173 } 174 175 static int lock_and_set_unaligned_access_static_branch(void) 176 { 177 cpus_read_lock(); 178 set_unaligned_access_static_branches(); 179 cpus_read_unlock(); 180 181 return 0; 182 } 183 184 arch_initcall_sync(lock_and_set_unaligned_access_static_branch); 185 186 static int riscv_online_cpu(unsigned int cpu) 187 { 188 static struct page *buf; 189 190 /* We are already set since the last check */ 191 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) 192 goto exit; 193 194 buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 195 if (!buf) { 196 pr_warn("Allocation failure, not measuring misaligned performance\n"); 197 return -ENOMEM; 198 } 199 200 check_unaligned_access(buf); 201 __free_pages(buf, MISALIGNED_BUFFER_ORDER); 202 203 exit: 204 set_unaligned_access_static_branches(); 205 206 return 0; 207 } 208 209 static int riscv_offline_cpu(unsigned int cpu) 210 { 211 set_unaligned_access_static_branches_except_cpu(cpu); 212 213 return 0; 214 } 215 216 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 217 static int check_unaligned_access_speed_all_cpus(void) 218 { 219 unsigned int cpu; 220 unsigned int cpu_count = num_possible_cpus(); 221 struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL); 222 223 if (!bufs) { 224 pr_warn("Allocation failure, not measuring misaligned performance\n"); 225 return 0; 226 } 227 228 /* 229 * Allocate separate buffers for each CPU so there's no fighting over 230 * cache lines. 231 */ 232 for_each_cpu(cpu, cpu_online_mask) { 233 bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 234 if (!bufs[cpu]) { 235 pr_warn("Allocation failure, not measuring misaligned performance\n"); 236 goto out; 237 } 238 } 239 240 /* Check everybody except 0, who stays behind to tend jiffies. */ 241 on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1); 242 243 /* Check core 0. */ 244 smp_call_on_cpu(0, check_unaligned_access, bufs[0], true); 245 246 /* 247 * Setup hotplug callbacks for any new CPUs that come online or go 248 * offline. 249 */ 250 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 251 riscv_online_cpu, riscv_offline_cpu); 252 253 out: 254 for_each_cpu(cpu, cpu_online_mask) { 255 if (bufs[cpu]) 256 __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); 257 } 258 259 kfree(bufs); 260 return 0; 261 } 262 263 static int check_unaligned_access_all_cpus(void) 264 { 265 bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); 266 267 if (!all_cpus_emulated) 268 return check_unaligned_access_speed_all_cpus(); 269 270 return 0; 271 } 272 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ 273 static int check_unaligned_access_all_cpus(void) 274 { 275 check_unaligned_access_emulated_all_cpus(); 276 277 return 0; 278 } 279 #endif 280 281 arch_initcall(check_unaligned_access_all_cpus); 282