1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright 2024 Rivos Inc.
4 */
5
6 #include <linux/cpu.h>
7 #include <linux/cpumask.h>
8 #include <linux/jump_label.h>
9 #include <linux/kthread.h>
10 #include <linux/mm.h>
11 #include <linux/smp.h>
12 #include <linux/types.h>
13 #include <asm/cpufeature.h>
14 #include <asm/hwprobe.h>
15 #include <asm/vector.h>
16
17 #include "copy-unaligned.h"
18
/* Each timed measurement window lasts (1 << MISALIGNED_ACCESS_JIFFIES_LG2) jiffies. */
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
/* Size in bytes of the scratch buffer holding both the copy destination and source. */
#define MISALIGNED_BUFFER_SIZE 0x4000
/* Page allocation order covering MISALIGNED_BUFFER_SIZE. */
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
/*
 * Bytes copied per timed run: half the buffer minus 0x80 bytes of slack, so
 * the deliberately offset source and destination stay inside the buffer.
 */
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

/* Per-CPU scalar misaligned access classification; starts out as unknown. */
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
/* Per-CPU vector misaligned access classification; starts out as unsupported. */
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
26
27 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
28 static cpumask_t fast_misaligned_access;
check_unaligned_access(void * param)29 static int check_unaligned_access(void *param)
30 {
31 int cpu = smp_processor_id();
32 u64 start_cycles, end_cycles;
33 u64 word_cycles;
34 u64 byte_cycles;
35 int ratio;
36 unsigned long start_jiffies, now;
37 struct page *page = param;
38 void *dst;
39 void *src;
40 long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
41
42 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
43 return 0;
44
45 /* Make an unaligned destination buffer. */
46 dst = (void *)((unsigned long)page_address(page) | 0x1);
47 /* Unalign src as well, but differently (off by 1 + 2 = 3). */
48 src = dst + (MISALIGNED_BUFFER_SIZE / 2);
49 src += 2;
50 word_cycles = -1ULL;
51 /* Do a warmup. */
52 __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
53 preempt_disable();
54 start_jiffies = jiffies;
55 while ((now = jiffies) == start_jiffies)
56 cpu_relax();
57
58 /*
59 * For a fixed amount of time, repeatedly try the function, and take
60 * the best time in cycles as the measurement.
61 */
62 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
63 start_cycles = get_cycles64();
64 /* Ensure the CSR read can't reorder WRT to the copy. */
65 mb();
66 __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
67 /* Ensure the copy ends before the end time is snapped. */
68 mb();
69 end_cycles = get_cycles64();
70 if ((end_cycles - start_cycles) < word_cycles)
71 word_cycles = end_cycles - start_cycles;
72 }
73
74 byte_cycles = -1ULL;
75 __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
76 start_jiffies = jiffies;
77 while ((now = jiffies) == start_jiffies)
78 cpu_relax();
79
80 while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
81 start_cycles = get_cycles64();
82 mb();
83 __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
84 mb();
85 end_cycles = get_cycles64();
86 if ((end_cycles - start_cycles) < byte_cycles)
87 byte_cycles = end_cycles - start_cycles;
88 }
89
90 preempt_enable();
91
92 /* Don't divide by zero. */
93 if (!word_cycles || !byte_cycles) {
94 pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
95 cpu);
96
97 return 0;
98 }
99
100 if (word_cycles < byte_cycles)
101 speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
102
103 ratio = div_u64((byte_cycles * 100), word_cycles);
104 pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
105 cpu,
106 ratio / 100,
107 ratio % 100,
108 (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");
109
110 per_cpu(misaligned_access_speed, cpu) = speed;
111
112 /*
113 * Set the value of fast_misaligned_access of a CPU. These operations
114 * are atomic to avoid race conditions.
115 */
116 if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
117 cpumask_set_cpu(cpu, &fast_misaligned_access);
118 else
119 cpumask_clear_cpu(cpu, &fast_misaligned_access);
120
121 return 0;
122 }
123
/*
 * Cross-call handler: measure the calling CPU using its own entry in the
 * page array passed as @param, unless the calling CPU is CPU 0.
 */
static void check_unaligned_access_nonboot_cpu(void *param)
{
	struct page **per_cpu_pages = param;
	unsigned int me = smp_processor_id();

	/* CPU 0 is skipped here. */
	if (me == 0)
		return;

	check_unaligned_access(per_cpu_pages[me]);
}
132
133 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
134
modify_unaligned_access_branches(cpumask_t * mask,int weight)135 static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
136 {
137 if (cpumask_weight(mask) == weight)
138 static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
139 else
140 static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
141 }
142
set_unaligned_access_static_branches_except_cpu(int cpu)143 static void set_unaligned_access_static_branches_except_cpu(int cpu)
144 {
145 /*
146 * Same as set_unaligned_access_static_branches, except excludes the
147 * given CPU from the result. When a CPU is hotplugged into an offline
148 * state, this function is called before the CPU is set to offline in
149 * the cpumask, and thus the CPU needs to be explicitly excluded.
150 */
151
152 cpumask_t fast_except_me;
153
154 cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
155 cpumask_clear_cpu(cpu, &fast_except_me);
156
157 modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
158 }
159
set_unaligned_access_static_branches(void)160 static void set_unaligned_access_static_branches(void)
161 {
162 /*
163 * This will be called after check_unaligned_access_all_cpus so the
164 * result of unaligned access speed for all CPUs will be available.
165 *
166 * To avoid the number of online cpus changing between reading
167 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
168 * held before calling this function.
169 */
170
171 cpumask_t fast_and_online;
172
173 cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
174
175 modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
176 }
177
/*
 * Initcall wrapper: take the CPU hotplug read lock around
 * set_unaligned_access_static_branches(), which requires it.
 *
 * Return: always 0.
 */
static int lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

/* Registered at the arch_initcall_sync level. */
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
188
riscv_online_cpu(unsigned int cpu)189 static int riscv_online_cpu(unsigned int cpu)
190 {
191 static struct page *buf;
192
193 /* We are already set since the last check */
194 if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
195 goto exit;
196
197 check_unaligned_access_emulated(NULL);
198 buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
199 if (!buf) {
200 pr_warn("Allocation failure, not measuring misaligned performance\n");
201 return -ENOMEM;
202 }
203
204 check_unaligned_access(buf);
205 __free_pages(buf, MISALIGNED_BUFFER_ORDER);
206
207 exit:
208 set_unaligned_access_static_branches();
209
210 return 0;
211 }
212
/*
 * Hotplug "offline" callback: recompute the static key as if @cpu were
 * already gone.
 *
 * Return: always 0.
 */
static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}
219
220 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
check_unaligned_access_speed_all_cpus(void)221 static int check_unaligned_access_speed_all_cpus(void)
222 {
223 unsigned int cpu;
224 unsigned int cpu_count = num_possible_cpus();
225 struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
226
227 if (!bufs) {
228 pr_warn("Allocation failure, not measuring misaligned performance\n");
229 return 0;
230 }
231
232 /*
233 * Allocate separate buffers for each CPU so there's no fighting over
234 * cache lines.
235 */
236 for_each_cpu(cpu, cpu_online_mask) {
237 bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
238 if (!bufs[cpu]) {
239 pr_warn("Allocation failure, not measuring misaligned performance\n");
240 goto out;
241 }
242 }
243
244 /* Check everybody except 0, who stays behind to tend jiffies. */
245 on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
246
247 /* Check core 0. */
248 smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
249
250 /*
251 * Setup hotplug callbacks for any new CPUs that come online or go
252 * offline.
253 */
254 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
255 riscv_online_cpu, riscv_offline_cpu);
256
257 out:
258 for_each_cpu(cpu, cpu_online_mask) {
259 if (bufs[cpu])
260 __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
261 }
262
263 kfree(bufs);
264 return 0;
265 }
266 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
/* Scalar speed probing is compiled out: nothing to measure, report success. */
static int check_unaligned_access_speed_all_cpus(void)
{
	return 0;
}
271 #endif
272
273 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
/*
 * Work item: time misaligned vector word copies against vector byte copies
 * on the calling CPU and store the verdict in vector_misaligned_access.
 *
 * @work: unused.
 *
 * Nothing is measured unless the CPU's state is "unknown". The scratch
 * buffer is allocated here and released on every exit path after the
 * allocation; nothing is recorded when the cycle counter is too coarse.
 */
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	/* Start the measurement window on a jiffy edge. */
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	/* Repeat the same procedure for the byte-wise vector copy. */
	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder WRT to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		goto free;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	/* Byte time as a percentage of word time, printed with two decimals. */
	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;

free:
	/* The scratch buffer is only needed for the measurement itself. */
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}
367
riscv_online_cpu_vec(unsigned int cpu)368 static int riscv_online_cpu_vec(unsigned int cpu)
369 {
370 if (!has_vector())
371 return 0;
372
373 if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED)
374 return 0;
375
376 check_vector_unaligned_access_emulated(NULL);
377 check_vector_unaligned_access(NULL);
378 return 0;
379 }
380
381 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
vec_check_unaligned_access_speed_all_cpus(void * unused __always_unused)382 static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
383 {
384 schedule_on_each_cpu(check_vector_unaligned_access);
385
386 /*
387 * Setup hotplug callbacks for any new CPUs that come online or go
388 * offline.
389 */
390 cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
391 riscv_online_cpu_vec, NULL);
392
393 return 0;
394 }
395 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
/* Vector speed probing is compiled out: nothing to measure, report success. */
static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
400 #endif
401
check_unaligned_access_all_cpus(void)402 static int check_unaligned_access_all_cpus(void)
403 {
404 bool all_cpus_emulated, all_cpus_vec_unsupported;
405
406 all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
407 all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();
408
409 if (!all_cpus_vec_unsupported &&
410 IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
411 kthread_run(vec_check_unaligned_access_speed_all_cpus,
412 NULL, "vec_check_unaligned_access_speed_all_cpus");
413 }
414
415 if (!all_cpus_emulated)
416 return check_unaligned_access_speed_all_cpus();
417
418 return 0;
419 }
420
421 arch_initcall(check_unaligned_access_all_cpus);
422