// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

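/*
 * Measurement parameters: each timing loop runs for 2^MISALIGNED_ACCESS_JIFFIES_LG2
 * jiffies, and every iteration copies MISALIGNED_COPY_SIZE bytes between the
 * two halves of a MISALIGNED_BUFFER_SIZE (16 KiB) buffer.
 */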
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

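/*
 * Per-CPU probe results, consumed by the hwprobe syscall. The scalar entry
 * starts as UNKNOWN and the vector entry as UNSUPPORTED; they are filled in
 * either by probing or by the unaligned_*_speed= command line parameters.
 */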
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

static cpumask_t fast_misaligned_access;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

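/*
 * Runs on every CPU via on_each_cpu(); CPU 0 is skipped here because it stays
 * behind to keep jiffies ticking and is measured separately afterwards.
 */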
static void __init check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
#endif

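/* Enabled only while every online CPU reports fast unaligned scalar accesses. */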
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

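/*
 * Flip the static key based on whether all @weight CPUs of interest are in
 * @mask, i.e. whether every relevant CPU has fast unaligned accesses.
 */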
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int __init lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

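/*
 * CPU hotplug online callback: reuse a previously measured result or the
 * command line override when available; otherwise, if probing is configured,
 * measure the newly onlined CPU. Always refresh the static branch.
 */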
static int riscv_online_cpu(unsigned int cpu)
{
	int ret = cpu_online_unaligned_access_init(cpu);

	if (ret)
		return ret;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		goto exit;
	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
		goto exit;
	}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
	{
		static struct page *buf;

		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!buf) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			return -ENOMEM;
		}

		check_unaligned_access(buf);
		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
	}
#endif

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
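/*
 * Unlike the scalar probe, this runs as work scheduled on each CPU (see
 * schedule_on_each_cpu() below), so the timing loops execute in process
 * context where kernel-mode vector can be used.
 */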
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		goto free;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;

free:
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif

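/*
 * CPU hotplug online callback for the vector result: honour the command line
 * override first, otherwise probe the newly onlined CPU when vector probing
 * is configured.
 */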
static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
		return 0;
	}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
#endif

	return 0;
}

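/*
 * Strings accepted by the unaligned_scalar_speed= and unaligned_vector_speed=
 * parameters, indexed by the RISCV_HWPROBE_MISALIGNED_* values (the scalar and
 * vector encodings share the same numeric values).
 */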
static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };

static int __init set_unaligned_scalar_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);

static int __init set_unaligned_vector_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);

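/*
 * Boot-time entry point: apply command line overrides, detect emulation or
 * probe actual unaligned access speed for scalar and vector on all boot CPUs,
 * then register hotplug callbacks for CPUs that come online later.
 */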
static int __init check_unaligned_access_all_cpus(void)
{
	int cpu;

	unaligned_access_init();

	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
		for_each_online_cpu(cpu)
			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
	} else if (!check_unaligned_access_emulated_all_cpus()) {
		check_unaligned_access_speed_all_cpus();
	}

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		if (!has_vector() &&
		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
				speed_str[unaligned_vector_speed_param]);
		} else {
			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
		}
	}

	if (!has_vector())
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		for_each_online_cpu(cpu)
			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}

	/*
	 * Setup hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);