1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright 2024 Rivos Inc. 4 */ 5 6 #include <linux/cpu.h> 7 #include <linux/cpumask.h> 8 #include <linux/jump_label.h> 9 #include <linux/kthread.h> 10 #include <linux/mm.h> 11 #include <linux/smp.h> 12 #include <linux/types.h> 13 #include <asm/cpufeature.h> 14 #include <asm/hwprobe.h> 15 #include <asm/vector.h> 16 17 #include "copy-unaligned.h" 18 19 #define MISALIGNED_ACCESS_JIFFIES_LG2 1 20 #define MISALIGNED_BUFFER_SIZE 0x4000 21 #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) 22 #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) 23 24 DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; 25 DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; 26 27 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS 28 static cpumask_t fast_misaligned_access; 29 static int check_unaligned_access(void *param) 30 { 31 int cpu = smp_processor_id(); 32 u64 start_cycles, end_cycles; 33 u64 word_cycles; 34 u64 byte_cycles; 35 int ratio; 36 unsigned long start_jiffies, now; 37 struct page *page = param; 38 void *dst; 39 void *src; 40 long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; 41 42 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) 43 return 0; 44 45 /* Make an unaligned destination buffer. */ 46 dst = (void *)((unsigned long)page_address(page) | 0x1); 47 /* Unalign src as well, but differently (off by 1 + 2 = 3). */ 48 src = dst + (MISALIGNED_BUFFER_SIZE / 2); 49 src += 2; 50 word_cycles = -1ULL; 51 /* Do a warmup. */ 52 __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); 53 preempt_disable(); 54 start_jiffies = jiffies; 55 while ((now = jiffies) == start_jiffies) 56 cpu_relax(); 57 58 /* 59 * For a fixed amount of time, repeatedly try the function, and take 60 * the best time in cycles as the measurement. 61 */ 62 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { 63 start_cycles = get_cycles64(); 64 /* Ensure the CSR read can't reorder WRT to the copy. */ 65 mb(); 66 __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); 67 /* Ensure the copy ends before the end time is snapped. */ 68 mb(); 69 end_cycles = get_cycles64(); 70 if ((end_cycles - start_cycles) < word_cycles) 71 word_cycles = end_cycles - start_cycles; 72 } 73 74 byte_cycles = -1ULL; 75 __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); 76 start_jiffies = jiffies; 77 while ((now = jiffies) == start_jiffies) 78 cpu_relax(); 79 80 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { 81 start_cycles = get_cycles64(); 82 mb(); 83 __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); 84 mb(); 85 end_cycles = get_cycles64(); 86 if ((end_cycles - start_cycles) < byte_cycles) 87 byte_cycles = end_cycles - start_cycles; 88 } 89 90 preempt_enable(); 91 92 /* Don't divide by zero. */ 93 if (!word_cycles || !byte_cycles) { 94 pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", 95 cpu); 96 97 return 0; 98 } 99 100 if (word_cycles < byte_cycles) 101 speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; 102 103 ratio = div_u64((byte_cycles * 100), word_cycles); 104 pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", 105 cpu, 106 ratio / 100, 107 ratio % 100, 108 (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow"); 109 110 per_cpu(misaligned_access_speed, cpu) = speed; 111 112 /* 113 * Set the value of fast_misaligned_access of a CPU. These operations 114 * are atomic to avoid race conditions. 115 */ 116 if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) 117 cpumask_set_cpu(cpu, &fast_misaligned_access); 118 else 119 cpumask_clear_cpu(cpu, &fast_misaligned_access); 120 121 return 0; 122 } 123 124 static void check_unaligned_access_nonboot_cpu(void *param) 125 { 126 unsigned int cpu = smp_processor_id(); 127 struct page **pages = param; 128 129 if (smp_processor_id() != 0) 130 check_unaligned_access(pages[cpu]); 131 } 132 133 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); 134 135 static void modify_unaligned_access_branches(cpumask_t *mask, int weight) 136 { 137 if (cpumask_weight(mask) == weight) 138 static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); 139 else 140 static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); 141 } 142 143 static void set_unaligned_access_static_branches_except_cpu(int cpu) 144 { 145 /* 146 * Same as set_unaligned_access_static_branches, except excludes the 147 * given CPU from the result. When a CPU is hotplugged into an offline 148 * state, this function is called before the CPU is set to offline in 149 * the cpumask, and thus the CPU needs to be explicitly excluded. 150 */ 151 152 cpumask_t fast_except_me; 153 154 cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); 155 cpumask_clear_cpu(cpu, &fast_except_me); 156 157 modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); 158 } 159 160 static void set_unaligned_access_static_branches(void) 161 { 162 /* 163 * This will be called after check_unaligned_access_all_cpus so the 164 * result of unaligned access speed for all CPUs will be available. 165 * 166 * To avoid the number of online cpus changing between reading 167 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be 168 * held before calling this function. 169 */ 170 171 cpumask_t fast_and_online; 172 173 cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); 174 175 modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); 176 } 177 178 static int lock_and_set_unaligned_access_static_branch(void) 179 { 180 cpus_read_lock(); 181 set_unaligned_access_static_branches(); 182 cpus_read_unlock(); 183 184 return 0; 185 } 186 187 arch_initcall_sync(lock_and_set_unaligned_access_static_branch); 188 189 static int riscv_online_cpu(unsigned int cpu) 190 { 191 static struct page *buf; 192 193 /* We are already set since the last check */ 194 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) 195 goto exit; 196 197 check_unaligned_access_emulated(NULL); 198 buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 199 if (!buf) { 200 pr_warn("Allocation failure, not measuring misaligned performance\n"); 201 return -ENOMEM; 202 } 203 204 check_unaligned_access(buf); 205 __free_pages(buf, MISALIGNED_BUFFER_ORDER); 206 207 exit: 208 set_unaligned_access_static_branches(); 209 210 return 0; 211 } 212 213 static int riscv_offline_cpu(unsigned int cpu) 214 { 215 set_unaligned_access_static_branches_except_cpu(cpu); 216 217 return 0; 218 } 219 220 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 221 static int check_unaligned_access_speed_all_cpus(void) 222 { 223 unsigned int cpu; 224 unsigned int cpu_count = num_possible_cpus(); 225 struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL); 226 227 if (!bufs) { 228 pr_warn("Allocation failure, not measuring misaligned performance\n"); 229 return 0; 230 } 231 232 /* 233 * Allocate separate buffers for each CPU so there's no fighting over 234 * cache lines. 235 */ 236 for_each_cpu(cpu, cpu_online_mask) { 237 bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 238 if (!bufs[cpu]) { 239 pr_warn("Allocation failure, not measuring misaligned performance\n"); 240 goto out; 241 } 242 } 243 244 /* Check everybody except 0, who stays behind to tend jiffies. */ 245 on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1); 246 247 /* Check core 0. */ 248 smp_call_on_cpu(0, check_unaligned_access, bufs[0], true); 249 250 /* 251 * Setup hotplug callbacks for any new CPUs that come online or go 252 * offline. 253 */ 254 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 255 riscv_online_cpu, riscv_offline_cpu); 256 257 out: 258 for_each_cpu(cpu, cpu_online_mask) { 259 if (bufs[cpu]) 260 __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); 261 } 262 263 kfree(bufs); 264 return 0; 265 } 266 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ 267 static int check_unaligned_access_speed_all_cpus(void) 268 { 269 return 0; 270 } 271 #endif 272 273 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS 274 static void check_vector_unaligned_access(struct work_struct *work __always_unused) 275 { 276 int cpu = smp_processor_id(); 277 u64 start_cycles, end_cycles; 278 u64 word_cycles; 279 u64 byte_cycles; 280 int ratio; 281 unsigned long start_jiffies, now; 282 struct page *page; 283 void *dst; 284 void *src; 285 long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; 286 287 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) 288 return; 289 290 page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); 291 if (!page) { 292 pr_warn("Allocation failure, not measuring vector misaligned performance\n"); 293 return; 294 } 295 296 /* Make an unaligned destination buffer. */ 297 dst = (void *)((unsigned long)page_address(page) | 0x1); 298 /* Unalign src as well, but differently (off by 1 + 2 = 3). */ 299 src = dst + (MISALIGNED_BUFFER_SIZE / 2); 300 src += 2; 301 word_cycles = -1ULL; 302 303 /* Do a warmup. */ 304 kernel_vector_begin(); 305 __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); 306 307 start_jiffies = jiffies; 308 while ((now = jiffies) == start_jiffies) 309 cpu_relax(); 310 311 /* 312 * For a fixed amount of time, repeatedly try the function, and take 313 * the best time in cycles as the measurement. 314 */ 315 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { 316 start_cycles = get_cycles64(); 317 /* Ensure the CSR read can't reorder WRT to the copy. */ 318 mb(); 319 __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); 320 /* Ensure the copy ends before the end time is snapped. */ 321 mb(); 322 end_cycles = get_cycles64(); 323 if ((end_cycles - start_cycles) < word_cycles) 324 word_cycles = end_cycles - start_cycles; 325 } 326 327 byte_cycles = -1ULL; 328 __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); 329 start_jiffies = jiffies; 330 while ((now = jiffies) == start_jiffies) 331 cpu_relax(); 332 333 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { 334 start_cycles = get_cycles64(); 335 /* Ensure the CSR read can't reorder WRT to the copy. */ 336 mb(); 337 __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); 338 /* Ensure the copy ends before the end time is snapped. */ 339 mb(); 340 end_cycles = get_cycles64(); 341 if ((end_cycles - start_cycles) < byte_cycles) 342 byte_cycles = end_cycles - start_cycles; 343 } 344 345 kernel_vector_end(); 346 347 /* Don't divide by zero. */ 348 if (!word_cycles || !byte_cycles) { 349 pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n", 350 cpu); 351 352 return; 353 } 354 355 if (word_cycles < byte_cycles) 356 speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; 357 358 ratio = div_u64((byte_cycles * 100), word_cycles); 359 pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n", 360 cpu, 361 ratio / 100, 362 ratio % 100, 363 (speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow"); 364 365 per_cpu(vector_misaligned_access, cpu) = speed; 366 } 367 368 static int riscv_online_cpu_vec(unsigned int cpu) 369 { 370 if (!has_vector()) 371 return 0; 372 373 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) 374 return 0; 375 376 check_vector_unaligned_access_emulated(NULL); 377 check_vector_unaligned_access(NULL); 378 return 0; 379 } 380 381 /* Measure unaligned access speed on all CPUs present at boot in parallel. */ 382 static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) 383 { 384 schedule_on_each_cpu(check_vector_unaligned_access); 385 386 /* 387 * Setup hotplug callbacks for any new CPUs that come online or go 388 * offline. 389 */ 390 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", 391 riscv_online_cpu_vec, NULL); 392 393 return 0; 394 } 395 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */ 396 static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) 397 { 398 return 0; 399 } 400 #endif 401 402 static int check_unaligned_access_all_cpus(void) 403 { 404 bool all_cpus_emulated, all_cpus_vec_unsupported; 405 406 all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); 407 all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus(); 408 409 if (!all_cpus_vec_unsupported && 410 IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) { 411 kthread_run(vec_check_unaligned_access_speed_all_cpus, 412 NULL, "vec_check_unaligned_access_speed_all_cpus"); 413 } 414 415 if (!all_cpus_emulated) 416 return check_unaligned_access_speed_all_cpus(); 417 418 return 0; 419 } 420 421 arch_initcall(check_unaligned_access_all_cpus); 422