// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

static cpumask_t fast_misaligned_access;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

static void __init check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
#endif

DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int __init lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

static int riscv_online_cpu(unsigned int cpu)
{
	int ret = cpu_online_unaligned_access_init(cpu);

	if (ret)
		return ret;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		goto exit;
	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
		goto exit;
	}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
	{
		static struct page *buf;

		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!buf) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			return -ENOMEM;
		}

		check_unaligned_access(buf);
		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
	}
#endif

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		goto free;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;

free:
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif

static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
		return 0;
	}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
#endif

	return 0;
}

static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };

static int __init set_unaligned_scalar_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);

static int __init set_unaligned_vector_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);
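
/*
 * Usage sketch (illustrative values, not mandated by this file): the two
 * __setup() handlers above accept "slow", "fast" or "unsupported" (anything
 * else returns -EINVAL), so boot-time probing can be skipped from the kernel
 * command line, e.g.:
 *
 *	unaligned_scalar_speed=fast unaligned_vector_speed=unsupported
 *
 * check_unaligned_access_all_cpus() below then seeds each online CPU's
 * per-cpu value from the parameter instead of running the timing loops.
 */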

static int __init check_unaligned_access_all_cpus(void)
{
	int cpu;

	unaligned_access_init();

	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
		for_each_online_cpu(cpu)
			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
	} else if (!check_unaligned_access_emulated_all_cpus()) {
		check_unaligned_access_speed_all_cpus();
	}

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		if (!has_vector() &&
		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
				speed_str[unaligned_vector_speed_param]);
		} else {
			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
		}
	}

	if (!has_vector())
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		for_each_online_cpu(cpu)
			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);
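
/*
 * Consumption sketch (not part of this file; helper names are hypothetical):
 * fast_unaligned_access_speed_key is a static key, so callers elsewhere can
 * gate an unaligned-access fast path on it with the static-branch API,
 * roughly:
 *
 *	if (static_branch_likely(&fast_unaligned_access_speed_key))
 *		do_misaligned_fast_path();	// hypothetical caller
 *	else
 *		do_aligned_fallback();		// hypothetical caller
 *
 * The key is only enabled while every online CPU is marked "fast", so such a
 * fast path is safe system-wide.
 */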