1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright 2024 Rivos Inc. 4 */ 5 6 #include <linux/cpu.h> 7 #include <linux/cpumask.h> 8 #include <linux/jump_label.h> 9 #include <linux/kthread.h> 10 #include <linux/mm.h> 11 #include <linux/smp.h> 12 #include <linux/types.h> 13 #include <asm/cpufeature.h> 14 #include <asm/hwprobe.h> 15 #include <asm/vector.h> 16 17 #include "copy-unaligned.h" 18 19 #define MISALIGNED_ACCESS_NS 8000000 20 #define MISALIGNED_BUFFER_SIZE 0x4000 21 #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) 22 #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) 23 24 DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; 25 DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 26 27 static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; 28 static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; 29 30 static u64 __maybe_unused 31 measure_cycles(void (*func)(void *dst, const void *src, size_t len), 32 void *dst, void *src, size_t len) 33 { 34 u64 start_cycles, end_cycles, cycles = -1ULL; 35 u64 start_ns; 36 37 /* Do a warmup. */ 38 func(dst, src, len); 39 40 preempt_disable(); 41 42 /* 43 * For a fixed amount of time, repeatedly try the function, and take 44 * the best time in cycles as the measurement. 45 */ 46 start_ns = ktime_get_mono_fast_ns(); 47 while (ktime_get_mono_fast_ns() < start_ns + MISALIGNED_ACCESS_NS) { 48 start_cycles = get_cycles64(); 49 /* Ensure the CSR read can't reorder WRT to the copy. */ 50 mb(); 51 func(dst, src, len); 52 /* Ensure the copy ends before the end time is snapped. */ 53 mb(); 54 end_cycles = get_cycles64(); 55 if ((end_cycles - start_cycles) < cycles) 56 cycles = end_cycles - start_cycles; 57 } 58 59 preempt_enable(); 60 61 return cycles; 62 } 63 64 /* 65 * Return: 66 * 1 if unaligned accesses are fast 67 * 0 if unaligned accesses are slow 68 * -1 if check cannot be done 69 */ 70 static int __maybe_unused 71 compare_unaligned_access(void (*word_copy)(void *dst, const void *src, size_t len), 72 void (*byte_copy)(void *dst, const void *src, size_t len), 73 void *buf, const char *type) 74 { 75 int cpu = smp_processor_id(); 76 u64 word_cycles; 77 u64 byte_cycles; 78 void *dst, *src; 79 bool fast; 80 int ratio; 81 82 /* Make an unaligned destination buffer. */ 83 dst = (void *)((unsigned long)buf | 0x1); 84 /* Unalign src as well, but differently (off by 1 + 2 = 3). */ 85 src = dst + (MISALIGNED_BUFFER_SIZE / 2); 86 src += 2; 87 88 word_cycles = measure_cycles(word_copy, dst, src, MISALIGNED_COPY_SIZE); 89 byte_cycles = measure_cycles(byte_copy, dst, src, MISALIGNED_COPY_SIZE); 90 91 /* Don't divide by zero. */ 92 if (!word_cycles || !byte_cycles) { 93 pr_warn("cpu%d: rdtime lacks granularity needed to measure %s unaligned access speed\n", 94 cpu, type); 95 96 return -1; 97 } 98 99 fast = word_cycles < byte_cycles; 100 101 ratio = div_u64((byte_cycles * 100), word_cycles); 102 pr_info("cpu%d: %s unaligned word access speed is %d.%02dx byte access speed (%s)\n", 103 cpu, 104 type, 105 ratio / 100, 106 ratio % 100, 107 fast ? "fast" : "slow"); 108 109 return fast; 110 } 111 112 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS 113 static int check_unaligned_access(struct page *page) 114 { 115 void *buf = page_address(page); 116 int cpu = smp_processor_id(); 117 int ret; 118 119 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) 120 return 0; 121 122 ret = compare_unaligned_access(__riscv_copy_words_unaligned, 123 __riscv_copy_bytes_unaligned, 124 buf, "scalar"); 125 if (ret < 0) 126 return 0; 127 128 /* 129 * Set the value of fast_misaligned_access of a CPU. These operations 130 * are atomic to avoid race conditions. 131 */ 132 if (ret) 133 per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; 134 else 135 per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; 136 137 return 0; 138 } 139 140 static void __init _check_unaligned_access(void *param) 141 { 142 unsigned int cpu = smp_processor_id(); 143 struct page **pages = param; 144 145 check_unaligned_access(pages[cpu]); 146 } 147 148 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 149 static void __init check_unaligned_access_speed_all_cpus(void) 150 { 151 unsigned int cpu; 152 unsigned int cpu_count = num_possible_cpus(); 153 struct page **bufs = kzalloc_objs(*bufs, cpu_count); 154 155 if (!bufs) { 156 pr_warn("Allocation failure, not measuring misaligned performance\n"); 157 return; 158 } 159 160 /* 161 * Allocate separate buffers for each CPU so there's no fighting over 162 * cache lines. 163 */ 164 for_each_cpu(cpu, cpu_online_mask) { 165 bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 166 if (!bufs[cpu]) { 167 pr_warn("Allocation failure, not measuring misaligned performance\n"); 168 goto out; 169 } 170 } 171 172 on_each_cpu(_check_unaligned_access, bufs, 1); 173 174 out: 175 for_each_cpu(cpu, cpu_online_mask) { 176 if (bufs[cpu]) 177 __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); 178 } 179 180 kfree(bufs); 181 } 182 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ 183 static void __init check_unaligned_access_speed_all_cpus(void) 184 { 185 } 186 #endif 187 188 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); 189 190 static void modify_unaligned_access_branches(const cpumask_t *mask) 191 { 192 bool fast = true; 193 int cpu; 194 195 for_each_cpu(cpu, mask) { 196 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) { 197 fast = false; 198 break; 199 } 200 } 201 202 if (fast) 203 static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); 204 else 205 static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); 206 } 207 208 static int riscv_online_cpu(unsigned int cpu) 209 { 210 int ret = cpu_online_unaligned_access_init(cpu); 211 212 if (ret) 213 return ret; 214 215 /* We are already set since the last check */ 216 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { 217 goto exit; 218 } else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { 219 per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param; 220 goto exit; 221 } 222 223 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS 224 { 225 static struct page *buf; 226 227 buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 228 if (!buf) { 229 pr_warn("Allocation failure, not measuring misaligned performance\n"); 230 return -ENOMEM; 231 } 232 233 check_unaligned_access(buf); 234 __free_pages(buf, MISALIGNED_BUFFER_ORDER); 235 } 236 #endif 237 238 exit: 239 modify_unaligned_access_branches(cpu_online_mask); 240 241 return 0; 242 } 243 244 static int riscv_offline_cpu(unsigned int cpu) 245 { 246 cpumask_t mask; 247 248 cpumask_copy(&mask, cpu_online_mask); 249 cpumask_clear_cpu(cpu, &mask); 250 251 modify_unaligned_access_branches(&mask); 252 253 return 0; 254 } 255 256 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS 257 static void check_vector_unaligned_access(struct work_struct *work __always_unused) 258 { 259 int cpu = smp_processor_id(); 260 struct page *page; 261 int ret; 262 263 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) 264 return; 265 266 page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 267 if (!page) { 268 pr_warn("Allocation failure, not measuring vector misaligned performance\n"); 269 return; 270 } 271 272 kernel_vector_begin(); 273 274 ret = compare_unaligned_access(__riscv_copy_vec_words_unaligned, 275 __riscv_copy_vec_bytes_unaligned, 276 page_address(page), "vector"); 277 kernel_vector_end(); 278 279 if (ret < 0) 280 goto free; 281 282 if (ret) 283 per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; 284 else 285 per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; 286 287 free: 288 __free_pages(page, MISALIGNED_BUFFER_ORDER); 289 } 290 291 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 292 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) 293 { 294 schedule_on_each_cpu(check_vector_unaligned_access); 295 riscv_hwprobe_complete_async_probe(); 296 297 return 0; 298 } 299 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */ 300 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) 301 { 302 return 0; 303 } 304 #endif 305 306 static int riscv_online_cpu_vec(unsigned int cpu) 307 { 308 if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { 309 per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param; 310 return 0; 311 } 312 313 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS 314 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) 315 return 0; 316 317 check_vector_unaligned_access_emulated(NULL); 318 check_vector_unaligned_access(NULL); 319 #endif 320 321 return 0; 322 } 323 324 static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" }; 325 326 static int __init set_unaligned_scalar_speed_param(char *str) 327 { 328 if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW])) 329 unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; 330 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST])) 331 unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; 332 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED])) 333 unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED; 334 else 335 return -EINVAL; 336 337 return 1; 338 } 339 __setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param); 340 341 static int __init set_unaligned_vector_speed_param(char *str) 342 { 343 if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW])) 344 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; 345 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST])) 346 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; 347 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED])) 348 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 349 else 350 return -EINVAL; 351 352 return 1; 353 } 354 __setup("unaligned_vector_speed=", set_unaligned_vector_speed_param); 355 356 static int __init check_unaligned_access_all_cpus(void) 357 { 358 int cpu; 359 360 unaligned_access_init(); 361 362 if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { 363 pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n", 364 speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param); 365 for_each_online_cpu(cpu) 366 per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param; 367 } else if (!check_unaligned_access_emulated_all_cpus()) { 368 check_unaligned_access_speed_all_cpus(); 369 } 370 371 if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { 372 if (!has_vector() && 373 unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) { 374 pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n", 375 speed_str[unaligned_vector_speed_param]); 376 } else { 377 pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n", 378 speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param); 379 } 380 } 381 382 if (!has_vector()) 383 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 384 385 if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { 386 for_each_online_cpu(cpu) 387 per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param; 388 } else if (!check_vector_unaligned_access_emulated_all_cpus() && 389 IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) { 390 riscv_hwprobe_register_async_probe(); 391 if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus, 392 NULL, "vec_check_unaligned_access_speed_all_cpus"))) { 393 pr_warn("Failed to create vec_unalign_check kthread\n"); 394 riscv_hwprobe_complete_async_probe(); 395 } 396 } 397 398 /* 399 * Setup hotplug callbacks for any new CPUs that come online or go 400 * offline. 401 */ 402 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 403 riscv_online_cpu, riscv_offline_cpu); 404 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 405 riscv_online_cpu_vec, NULL); 406 407 cpus_read_lock(); 408 modify_unaligned_access_branches(cpu_online_mask); 409 cpus_read_unlock(); 410 411 return 0; 412 } 413 414 late_initcall(check_unaligned_access_all_cpus); 415