// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

static cpumask_t fast_misaligned_access;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

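	/*
	 * A CPU is reported "fast" only if its best-case misaligned word
	 * copy was quicker than its best-case byte-at-a-time copy; otherwise
	 * the default of RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW stands.
	 */
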
	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

static void __init check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
#endif

DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
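	 *
	 * With the given CPU excluded, fast_unaligned_access_speed_key ends
	 * up enabled only if every remaining online CPU reports fast
	 * misaligned access.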
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int __init lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

static int riscv_online_cpu(unsigned int cpu)
{
	int ret = cpu_online_unaligned_access_init(cpu);

	if (ret)
		return ret;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		goto exit;
	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
		goto exit;
	}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
	{
		static struct page *buf;

		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!buf) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			return -ENOMEM;
		}

		check_unaligned_access(buf);
		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
	}
#endif

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
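	 *
	 * The wait for a jiffies edge above means the loop gets a full
	 * (1 << MISALIGNED_ACCESS_JIFFIES_LG2)-jiffy window rather than a
	 * partial one.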
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		goto free;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;

free:
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}

/*
 * Measure unaligned access speed on all CPUs present at boot in parallel.
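 *
 * This runs from a kthread started by check_unaligned_access_all_cpus():
 * each CPU's measurement is queued with schedule_on_each_cpu(), and the
 * pending hwprobe async probe is completed once they have all finished.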
 */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);
	riscv_hwprobe_complete_async_probe();

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif

static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
		return 0;
	}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
#endif

	return 0;
}

static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };

static int __init set_unaligned_scalar_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);

static int __init set_unaligned_vector_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);

static int __init check_unaligned_access_all_cpus(void)
{
	int cpu;

	unaligned_access_init();

	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
		for_each_online_cpu(cpu)
			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
	} else if (!check_unaligned_access_emulated_all_cpus()) {
		check_unaligned_access_speed_all_cpus();
	}

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		if (!has_vector() &&
		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
				speed_str[unaligned_vector_speed_param]);
		} else {
			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
		}
	}

	if (!has_vector())
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

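	/*
	 * The parameter is still UNKNOWN here only if it was neither set on
	 * the command line nor forced to UNSUPPORTED above; in that case
	 * fall back to emulation detection and, if configured, the timed
	 * probe on a kthread.
	 */
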
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		for_each_online_cpu(cpu)
			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		riscv_hwprobe_register_async_probe();
		if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
				       NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
			pr_warn("Failed to create vec_unalign_check kthread\n");
			riscv_hwprobe_complete_async_probe();
		}
	}

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);