// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

static cpumask_t fast_misaligned_access;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

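	/*
	 * Illustrative numbers (not from any particular hardware): if the
	 * best byte-copy pass took byte_cycles = 4000 and the best word-copy
	 * pass took word_cycles = 1500, then word_cycles < byte_cycles marks
	 * this CPU as FAST and ratio = 4000 * 100 / 1500 = 266, which the
	 * pr_info() below prints as "2.66".
	 */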
	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

static void __init check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
#endif

DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

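/*
 * Context note (assumption, not defined in this file): this static key is
 * the one consulted by helpers such as has_fast_unaligned_accesses() in
 * asm/cpufeature.h, letting hot paths choose an unaligned-friendly routine
 * with a static branch instead of reading the per-CPU speed value.
 */
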
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int __init lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

static int riscv_online_cpu(unsigned int cpu)
{
	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		goto exit;
	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
		goto exit;
	}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
	{
		static struct page *buf;

		check_unaligned_access_emulated(NULL);
		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!buf) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			return -ENOMEM;
		}

		check_unaligned_access(buf);
		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
	}
#endif

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

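/*
 * The vector probe below mirrors the scalar benchmark above: the same
 * buffer layout and best-of-many timing loops, but the copies run between
 * kernel_vector_begin()/kernel_vector_end() and the result is recorded in
 * vector_misaligned_access instead of misaligned_access_speed.
 */
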
#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		goto free;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;

free:
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}

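/*
 * Context note: the function below is started from a kthread at init time
 * (see check_unaligned_access_all_cpus()); schedule_on_each_cpu() queues the
 * benchmark as work on every online CPU and waits for it to complete.
 */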
/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif

static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
		return 0;
	}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
#endif

	return 0;
}

static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };

static int __init set_unaligned_scalar_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);

static int __init set_unaligned_vector_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);

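/*
 * Example (illustrative): booting with
 * "unaligned_scalar_speed=fast unaligned_vector_speed=unsupported" on the
 * kernel command line seeds every CPU's per-CPU value directly and skips
 * the corresponding probes in check_unaligned_access_all_cpus() below.
 */
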
static int __init check_unaligned_access_all_cpus(void)
{
	int cpu;

	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
		for_each_online_cpu(cpu)
			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
	} else if (!check_unaligned_access_emulated_all_cpus()) {
		check_unaligned_access_speed_all_cpus();
	}

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		if (!has_vector() &&
		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
				speed_str[unaligned_vector_speed_param]);
		} else {
			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
		}
	}

	if (!has_vector())
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		for_each_online_cpu(cpu)
			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);
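
/*
 * Illustrative userspace sketch (not part of this file, shown only for
 * context): the per-CPU values written above are reported to userspace via
 * the riscv_hwprobe() syscall. The key names below are assumptions about
 * the uapi in <asm/hwprobe.h>; check that header on the target kernel.
 *
 *	#include <asm/hwprobe.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	struct riscv_hwprobe pair = {
 *		.key = RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF,
 *	};
 *
 *	// cpusetsize == 0 and cpus == NULL query across all online harts;
 *	// pair.value comes back as one of the
 *	// RISCV_HWPROBE_MISALIGNED_SCALAR_* values assigned in this file.
 *	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) == 0 &&
 *	    pair.value == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
 *		; // misaligned scalar accesses are fast on every hart
 */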