// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

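/*
 * Each probe repeatedly copies MISALIGNED_COPY_SIZE bytes between the two
 * halves of a MISALIGNED_BUFFER_SIZE buffer for MISALIGNED_ACCESS_NS; the
 * 0x80 of slack leaves room to misalign both pointers without running off
 * the end of the buffer.
 */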
#define MISALIGNED_ACCESS_NS	8000000
#define MISALIGNED_BUFFER_SIZE	0x4000
#define MISALIGNED_BUFFER_ORDER	get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE	((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

static cpumask_t fast_misaligned_access;

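/*
 * Time one invocation of func(dst, src, len) with the cycle counter,
 * retrying for MISALIGNED_ACCESS_NS and keeping the best (smallest)
 * result so that interrupts and other noise are filtered out.
 */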
static u64 __maybe_unused
measure_cycles(void (*func)(void *dst, const void *src, size_t len),
	       void *dst, void *src, size_t len)
{
	u64 start_cycles, end_cycles, cycles = -1ULL;
	u64 start_ns;

	/* Do a warmup. */
	func(dst, src, len);

	preempt_disable();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	start_ns = ktime_get_mono_fast_ns();
	while (ktime_get_mono_fast_ns() < start_ns + MISALIGNED_ACCESS_NS) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't be reordered with respect to the copy. */
		mb();
		func(dst, src, len);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < cycles)
			cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	return cycles;
}

/*
 * Return:
 *  1 if unaligned accesses are fast
 *  0 if unaligned accesses are slow
 * -1 if the check cannot be done
 */
static int __maybe_unused
compare_unaligned_access(void (*word_copy)(void *dst, const void *src, size_t len),
			 void (*byte_copy)(void *dst, const void *src, size_t len),
			 void *buf, const char *type)
{
	int cpu = smp_processor_id();
	u64 word_cycles;
	u64 byte_cycles;
	void *dst, *src;
	bool fast;
	int ratio;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)buf | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;

	word_cycles = measure_cycles(word_copy, dst, src, MISALIGNED_COPY_SIZE);
	byte_cycles = measure_cycles(byte_copy, dst, src, MISALIGNED_COPY_SIZE);

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure %s unaligned access speed\n",
			cpu, type);

		return -1;
	}

	fast = word_cycles < byte_cycles;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: %s unaligned word access speed is %d.%02dx byte access speed (%s)\n",
		cpu,
		type,
		ratio / 100,
		ratio % 100,
		fast ? "fast" : "slow");

	return fast;
}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
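/*
 * Probe scalar copy speed on the calling CPU and record the result in
 * misaligned_access_speed and the fast_misaligned_access mask. Must run
 * on the CPU being measured, via on_each_cpu() or the hotplug callback.
 */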
static int check_unaligned_access(struct page *page)
{
	void *buf = page_address(page);
	int cpu = smp_processor_id();
	int ret;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	ret = compare_unaligned_access(__riscv_copy_words_unaligned,
				       __riscv_copy_bytes_unaligned,
				       buf, "scalar");
	if (ret < 0)
		return 0;

	/*
	 * Record this CPU's result. The cpumask updates are atomic, so
	 * concurrent probes on other CPUs can't corrupt the mask.
	 */
	if (ret) {
		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	} else {
		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
		cpumask_clear_cpu(cpu, &fast_misaligned_access);
	}

	return 0;
}

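/* on_each_cpu() trampoline: each CPU measures with its own buffer. */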
static void __init _check_unaligned_access(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	check_unaligned_access(pages[cpu]);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	on_each_cpu(_check_unaligned_access, bufs, 1);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
#endif

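/*
 * Enabled only while every online CPU is known to have fast misaligned
 * accesses, so callers can test a single static branch instead of reading
 * a per-CPU variable.
 */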
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except it excludes
	 * the given CPU from the result. When a CPU is taken offline, this
	 * runs before the CPU is cleared from cpu_online_mask, so it must be
	 * excluded explicitly.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int __init lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

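/*
 * CPUHP_AP_ONLINE_DYN callback, run on the incoming CPU: apply any
 * command-line override, otherwise measure this CPU, then recompute the
 * fast-access static branch with the new CPU included.
 */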
static int riscv_online_cpu(unsigned int cpu)
{
	int ret = cpu_online_unaligned_access_init(cpu);

	if (ret)
		return ret;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		goto exit;
	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
		goto exit;
	}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
	{
		static struct page *buf;

		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!buf) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			return -ENOMEM;
		}

		check_unaligned_access(buf);
		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
	}
#endif

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
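/*
 * Runs on each CPU via schedule_on_each_cpu() at boot, and directly from
 * the hotplug callback: probe vector copy speed inside a
 * kernel_vector_begin()/kernel_vector_end() section and record the result
 * in vector_misaligned_access.
 */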
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	struct page *page;
	int ret;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	kernel_vector_begin();

	ret = compare_unaligned_access(__riscv_copy_vec_words_unaligned,
				       __riscv_copy_vec_bytes_unaligned,
				       page_address(page), "vector");
	kernel_vector_end();

	if (ret < 0)
		goto free;

	if (ret)
		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else
		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

free:
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);
	riscv_hwprobe_complete_async_probe();

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif

static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
		return 0;
	}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
#endif

	return 0;
}

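/*
 * Indexed by the RISCV_HWPROBE_MISALIGNED_* values, which use the same
 * encoding for the scalar and vector variants, so one table serves both
 * parsers below; the first two slots are never matched by name.
 */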
static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };

static int __init set_unaligned_scalar_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);

static int __init set_unaligned_vector_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);

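/*
 * Entry point, run as a late initcall: pick scalar and vector speeds from
 * the command line, emulation detection, or the timed probes above, then
 * register hotplug callbacks so CPUs that come online later are handled
 * too. For example, booting with "unaligned_scalar_speed=fast" skips the
 * scalar probe and marks every CPU fast, including hotplugged ones.
 */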
static int __init check_unaligned_access_all_cpus(void)
{
	int cpu;

	unaligned_access_init();

	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
		for_each_online_cpu(cpu)
			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
	} else if (!check_unaligned_access_emulated_all_cpus()) {
		check_unaligned_access_speed_all_cpus();
	}

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		if (!has_vector() &&
		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
				speed_str[unaligned_vector_speed_param]);
		} else {
			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
		}
	}

	if (!has_vector())
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		for_each_online_cpu(cpu)
			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		riscv_hwprobe_register_async_probe();
		if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
				       NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
			pr_warn("Failed to create vec_unalign_check kthread\n");
			riscv_hwprobe_complete_async_probe();
		}
	}

	/*
	 * Set up hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

late_initcall(check_unaligned_access_all_cpus);