1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright 2024 Rivos Inc. 4 */ 5 6 #include <linux/cpu.h> 7 #include <linux/cpumask.h> 8 #include <linux/jump_label.h> 9 #include <linux/kthread.h> 10 #include <linux/mm.h> 11 #include <linux/smp.h> 12 #include <linux/types.h> 13 #include <asm/cpufeature.h> 14 #include <asm/hwprobe.h> 15 #include <asm/vector.h> 16 17 #include "copy-unaligned.h" 18 19 #define MISALIGNED_ACCESS_NS 8000000 20 #define MISALIGNED_BUFFER_SIZE 0x4000 21 #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) 22 #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) 23 24 DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; 25 DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 26 27 static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; 28 static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; 29 30 static cpumask_t fast_misaligned_access; 31 32 static u64 __maybe_unused 33 measure_cycles(void (*func)(void *dst, const void *src, size_t len), 34 void *dst, void *src, size_t len) 35 { 36 u64 start_cycles, end_cycles, cycles = -1ULL; 37 u64 start_ns; 38 39 /* Do a warmup. */ 40 func(dst, src, len); 41 42 preempt_disable(); 43 44 /* 45 * For a fixed amount of time, repeatedly try the function, and take 46 * the best time in cycles as the measurement. 47 */ 48 start_ns = ktime_get_mono_fast_ns(); 49 while (ktime_get_mono_fast_ns() < start_ns + MISALIGNED_ACCESS_NS) { 50 start_cycles = get_cycles64(); 51 /* Ensure the CSR read can't reorder WRT to the copy. */ 52 mb(); 53 func(dst, src, len); 54 /* Ensure the copy ends before the end time is snapped. */ 55 mb(); 56 end_cycles = get_cycles64(); 57 if ((end_cycles - start_cycles) < cycles) 58 cycles = end_cycles - start_cycles; 59 } 60 61 preempt_enable(); 62 63 return cycles; 64 } 65 66 /* 67 * Return: 68 * 1 if unaligned accesses are fast 69 * 0 if unaligned accesses are slow 70 * -1 if check cannot be done 71 */ 72 static int __maybe_unused 73 compare_unaligned_access(void (*word_copy)(void *dst, const void *src, size_t len), 74 void (*byte_copy)(void *dst, const void *src, size_t len), 75 void *buf, const char *type) 76 { 77 int cpu = smp_processor_id(); 78 u64 word_cycles; 79 u64 byte_cycles; 80 void *dst, *src; 81 bool fast; 82 int ratio; 83 84 /* Make an unaligned destination buffer. */ 85 dst = (void *)((unsigned long)buf | 0x1); 86 /* Unalign src as well, but differently (off by 1 + 2 = 3). */ 87 src = dst + (MISALIGNED_BUFFER_SIZE / 2); 88 src += 2; 89 90 word_cycles = measure_cycles(word_copy, dst, src, MISALIGNED_COPY_SIZE); 91 byte_cycles = measure_cycles(byte_copy, dst, src, MISALIGNED_COPY_SIZE); 92 93 /* Don't divide by zero. */ 94 if (!word_cycles || !byte_cycles) { 95 pr_warn("cpu%d: rdtime lacks granularity needed to measure %s unaligned access speed\n", 96 cpu, type); 97 98 return -1; 99 } 100 101 fast = word_cycles < byte_cycles; 102 103 ratio = div_u64((byte_cycles * 100), word_cycles); 104 pr_info("cpu%d: %s unaligned word access speed is %d.%02dx byte access speed (%s)\n", 105 cpu, 106 type, 107 ratio / 100, 108 ratio % 100, 109 fast ? "fast" : "slow"); 110 111 return fast; 112 } 113 114 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS 115 static int check_unaligned_access(struct page *page) 116 { 117 void *buf = page_address(page); 118 int cpu = smp_processor_id(); 119 int ret; 120 121 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) 122 return 0; 123 124 ret = compare_unaligned_access(__riscv_copy_words_unaligned, 125 __riscv_copy_bytes_unaligned, 126 buf, "scalar"); 127 if (ret < 0) 128 return 0; 129 130 /* 131 * Set the value of fast_misaligned_access of a CPU. These operations 132 * are atomic to avoid race conditions. 133 */ 134 if (ret) { 135 per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; 136 cpumask_set_cpu(cpu, &fast_misaligned_access); 137 } else { 138 per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; 139 cpumask_clear_cpu(cpu, &fast_misaligned_access); 140 } 141 142 return 0; 143 } 144 145 static void __init _check_unaligned_access(void *param) 146 { 147 unsigned int cpu = smp_processor_id(); 148 struct page **pages = param; 149 150 check_unaligned_access(pages[cpu]); 151 } 152 153 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 154 static void __init check_unaligned_access_speed_all_cpus(void) 155 { 156 unsigned int cpu; 157 unsigned int cpu_count = num_possible_cpus(); 158 struct page **bufs = kzalloc_objs(*bufs, cpu_count); 159 160 if (!bufs) { 161 pr_warn("Allocation failure, not measuring misaligned performance\n"); 162 return; 163 } 164 165 /* 166 * Allocate separate buffers for each CPU so there's no fighting over 167 * cache lines. 168 */ 169 for_each_cpu(cpu, cpu_online_mask) { 170 bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 171 if (!bufs[cpu]) { 172 pr_warn("Allocation failure, not measuring misaligned performance\n"); 173 goto out; 174 } 175 } 176 177 on_each_cpu(_check_unaligned_access, bufs, 1); 178 179 out: 180 for_each_cpu(cpu, cpu_online_mask) { 181 if (bufs[cpu]) 182 __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); 183 } 184 185 kfree(bufs); 186 } 187 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ 188 static void __init check_unaligned_access_speed_all_cpus(void) 189 { 190 } 191 #endif 192 193 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); 194 195 static void modify_unaligned_access_branches(cpumask_t *mask, int weight) 196 { 197 if (cpumask_weight(mask) == weight) 198 static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); 199 else 200 static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); 201 } 202 203 static void set_unaligned_access_static_branches_except_cpu(int cpu) 204 { 205 /* 206 * Same as set_unaligned_access_static_branches, except excludes the 207 * given CPU from the result. When a CPU is hotplugged into an offline 208 * state, this function is called before the CPU is set to offline in 209 * the cpumask, and thus the CPU needs to be explicitly excluded. 210 */ 211 212 cpumask_t fast_except_me; 213 214 cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); 215 cpumask_clear_cpu(cpu, &fast_except_me); 216 217 modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); 218 } 219 220 static void set_unaligned_access_static_branches(void) 221 { 222 /* 223 * This will be called after check_unaligned_access_all_cpus so the 224 * result of unaligned access speed for all CPUs will be available. 225 * 226 * To avoid the number of online cpus changing between reading 227 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be 228 * held before calling this function. 229 */ 230 231 cpumask_t fast_and_online; 232 233 cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); 234 235 modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); 236 } 237 238 static int __init lock_and_set_unaligned_access_static_branch(void) 239 { 240 cpus_read_lock(); 241 set_unaligned_access_static_branches(); 242 cpus_read_unlock(); 243 244 return 0; 245 } 246 247 arch_initcall_sync(lock_and_set_unaligned_access_static_branch); 248 249 static int riscv_online_cpu(unsigned int cpu) 250 { 251 int ret = cpu_online_unaligned_access_init(cpu); 252 253 if (ret) 254 return ret; 255 256 /* We are already set since the last check */ 257 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { 258 goto exit; 259 } else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { 260 per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param; 261 goto exit; 262 } 263 264 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS 265 { 266 static struct page *buf; 267 268 buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 269 if (!buf) { 270 pr_warn("Allocation failure, not measuring misaligned performance\n"); 271 return -ENOMEM; 272 } 273 274 check_unaligned_access(buf); 275 __free_pages(buf, MISALIGNED_BUFFER_ORDER); 276 } 277 #endif 278 279 exit: 280 set_unaligned_access_static_branches(); 281 282 return 0; 283 } 284 285 static int riscv_offline_cpu(unsigned int cpu) 286 { 287 set_unaligned_access_static_branches_except_cpu(cpu); 288 289 return 0; 290 } 291 292 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS 293 static void check_vector_unaligned_access(struct work_struct *work __always_unused) 294 { 295 int cpu = smp_processor_id(); 296 struct page *page; 297 int ret; 298 299 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) 300 return; 301 302 page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 303 if (!page) { 304 pr_warn("Allocation failure, not measuring vector misaligned performance\n"); 305 return; 306 } 307 308 kernel_vector_begin(); 309 310 ret = compare_unaligned_access(__riscv_copy_vec_words_unaligned, 311 __riscv_copy_vec_bytes_unaligned, 312 page_address(page), "vector"); 313 kernel_vector_end(); 314 315 if (ret < 0) 316 goto free; 317 318 if (ret) 319 per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; 320 else 321 per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; 322 323 free: 324 __free_pages(page, MISALIGNED_BUFFER_ORDER); 325 } 326 327 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 328 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) 329 { 330 schedule_on_each_cpu(check_vector_unaligned_access); 331 riscv_hwprobe_complete_async_probe(); 332 333 return 0; 334 } 335 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */ 336 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) 337 { 338 return 0; 339 } 340 #endif 341 342 static int riscv_online_cpu_vec(unsigned int cpu) 343 { 344 if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { 345 per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param; 346 return 0; 347 } 348 349 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS 350 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) 351 return 0; 352 353 check_vector_unaligned_access_emulated(NULL); 354 check_vector_unaligned_access(NULL); 355 #endif 356 357 return 0; 358 } 359 360 static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" }; 361 362 static int __init set_unaligned_scalar_speed_param(char *str) 363 { 364 if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW])) 365 unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; 366 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST])) 367 unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; 368 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED])) 369 unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED; 370 else 371 return -EINVAL; 372 373 return 1; 374 } 375 __setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param); 376 377 static int __init set_unaligned_vector_speed_param(char *str) 378 { 379 if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW])) 380 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; 381 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST])) 382 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; 383 else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED])) 384 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 385 else 386 return -EINVAL; 387 388 return 1; 389 } 390 __setup("unaligned_vector_speed=", set_unaligned_vector_speed_param); 391 392 static int __init check_unaligned_access_all_cpus(void) 393 { 394 int cpu; 395 396 unaligned_access_init(); 397 398 if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { 399 pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n", 400 speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param); 401 for_each_online_cpu(cpu) 402 per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param; 403 } else if (!check_unaligned_access_emulated_all_cpus()) { 404 check_unaligned_access_speed_all_cpus(); 405 } 406 407 if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { 408 if (!has_vector() && 409 unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) { 410 pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n", 411 speed_str[unaligned_vector_speed_param]); 412 } else { 413 pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n", 414 speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param); 415 } 416 } 417 418 if (!has_vector()) 419 unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 420 421 if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { 422 for_each_online_cpu(cpu) 423 per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param; 424 } else if (!check_vector_unaligned_access_emulated_all_cpus() && 425 IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) { 426 riscv_hwprobe_register_async_probe(); 427 if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus, 428 NULL, "vec_check_unaligned_access_speed_all_cpus"))) { 429 pr_warn("Failed to create vec_unalign_check kthread\n"); 430 riscv_hwprobe_complete_async_probe(); 431 } 432 } 433 434 /* 435 * Setup hotplug callbacks for any new CPUs that come online or go 436 * offline. 437 */ 438 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 439 riscv_online_cpu, riscv_offline_cpu); 440 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 441 riscv_online_cpu_vec, NULL); 442 443 return 0; 444 } 445 446 late_initcall(check_unaligned_access_all_cpus); 447