xref: /linux/arch/riscv/kernel/unaligned_access_speed.c (revision 09b1704f5b02c18dd02b21343530463fcfc92c54)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2024 Rivos Inc.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <asm/cpufeature.h>
#include <asm/hwprobe.h>
#include <asm/vector.h>

#include "copy-unaligned.h"

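/*
 * Probe buffer layout (a sketch of the arithmetic, derived from
 * check_unaligned_access() below): the destination sits in the lower half of
 * the buffer at offset 1 and the source in the upper half at offset 3, so a
 * misaligned copy of MISALIGNED_COPY_SIZE bytes stays within its half of the
 * allocation; the 0x80 of slack absorbs the misalignment offsets.
 */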
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
#define MISALIGNED_BUFFER_SIZE 0x4000
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

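/*
 * Per-CPU probe results; these back the misaligned scalar/vector performance
 * values that the hwprobe interface reports to userspace (inferred from the
 * RISCV_HWPROBE_MISALIGNED_* values used throughout this file).
 */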
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

static cpumask_t fast_misaligned_access;

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
static int check_unaligned_access(void *param)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page = param;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;

	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
		return 0;

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;
	/* Do a warmup. */
	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	preempt_disable();
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder with respect to the copy. */
		mb();
		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		mb();
		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	preempt_enable();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
			cpu);

		return 0;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");

	per_cpu(misaligned_access_speed, cpu) = speed;

	/*
	 * Set the value of fast_misaligned_access of a CPU. These operations
	 * are atomic to avoid race conditions.
	 */
	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
		cpumask_set_cpu(cpu, &fast_misaligned_access);
	else
		cpumask_clear_cpu(cpu, &fast_misaligned_access);

	return 0;
}

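/*
 * on_each_cpu() callback: it runs on every online CPU, but CPU 0 bails out
 * here because it stays behind to keep jiffies ticking during the other CPUs'
 * measurements and is probed separately via smp_call_on_cpu() afterwards.
 */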
static void __init check_unaligned_access_nonboot_cpu(void *param)
{
	unsigned int cpu = smp_processor_id();
	struct page **pages = param;

	if (smp_processor_id() != 0)
		check_unaligned_access(pages[cpu]);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
	unsigned int cpu;
	unsigned int cpu_count = num_possible_cpus();
	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);

	if (!bufs) {
		pr_warn("Allocation failure, not measuring misaligned performance\n");
		return;
	}

	/*
	 * Allocate separate buffers for each CPU so there's no fighting over
	 * cache lines.
	 */
	for_each_cpu(cpu, cpu_online_mask) {
		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!bufs[cpu]) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			goto out;
		}
	}

	/* Check everybody except 0, who stays behind to tend jiffies. */
	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);

	/* Check core 0. */
	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);

out:
	for_each_cpu(cpu, cpu_online_mask) {
		if (bufs[cpu])
			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
	}

	kfree(bufs);
}
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
#endif

DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);

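/*
 * Enable the "all CPUs are fast" static key only when the number of CPUs in
 * @mask matches the expected @weight, i.e. every CPU under consideration
 * measured as fast; otherwise keep it disabled.
 */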
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
{
	if (cpumask_weight(mask) == weight)
		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
	else
		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
}

static void set_unaligned_access_static_branches_except_cpu(int cpu)
{
	/*
	 * Same as set_unaligned_access_static_branches, except excludes the
	 * given CPU from the result. When a CPU is hotplugged into an offline
	 * state, this function is called before the CPU is set to offline in
	 * the cpumask, and thus the CPU needs to be explicitly excluded.
	 */

	cpumask_t fast_except_me;

	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
	cpumask_clear_cpu(cpu, &fast_except_me);

	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
}

static void set_unaligned_access_static_branches(void)
{
	/*
	 * This will be called after check_unaligned_access_all_cpus so the
	 * result of unaligned access speed for all CPUs will be available.
	 *
	 * To avoid the number of online cpus changing between reading
	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
	 * held before calling this function.
	 */

	cpumask_t fast_and_online;

	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);

	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
}

static int __init lock_and_set_unaligned_access_static_branch(void)
{
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();

	return 0;
}

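/*
 * arch_initcall_sync() runs after the plain arch_initcall() that registers
 * check_unaligned_access_all_cpus() at the bottom of this file, so by this
 * point the per-CPU results and fast_misaligned_access are populated.
 */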
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);

static int riscv_online_cpu(unsigned int cpu)
{
	int ret = cpu_online_unaligned_access_init(cpu);

	if (ret)
		return ret;

	/* We are already set since the last check */
	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		goto exit;
	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
		goto exit;
	}

#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
	{
		static struct page *buf;

		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
		if (!buf) {
			pr_warn("Allocation failure, not measuring misaligned performance\n");
			return -ENOMEM;
		}

		check_unaligned_access(buf);
		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
	}
#endif

exit:
	set_unaligned_access_static_branches();

	return 0;
}

static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
static void check_vector_unaligned_access(struct work_struct *work __always_unused)
{
	int cpu = smp_processor_id();
	u64 start_cycles, end_cycles;
	u64 word_cycles;
	u64 byte_cycles;
	int ratio;
	unsigned long start_jiffies, now;
	struct page *page;
	void *dst;
	void *src;
	long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;

	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return;

	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
	if (!page) {
		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
		return;
	}

	/* Make an unaligned destination buffer. */
	dst = (void *)((unsigned long)page_address(page) | 0x1);
	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
	src += 2;
	word_cycles = -1ULL;

	/* Do a warmup. */
	kernel_vector_begin();
	__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);

	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	/*
	 * For a fixed amount of time, repeatedly try the function, and take
	 * the best time in cycles as the measurement.
	 */
	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder with respect to the copy. */
		mb();
		__riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < word_cycles)
			word_cycles = end_cycles - start_cycles;
	}

	byte_cycles = -1ULL;
	__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
	start_jiffies = jiffies;
	while ((now = jiffies) == start_jiffies)
		cpu_relax();

	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
		start_cycles = get_cycles64();
		/* Ensure the CSR read can't reorder with respect to the copy. */
		mb();
		__riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
		/* Ensure the copy ends before the end time is snapped. */
		mb();
		end_cycles = get_cycles64();
		if ((end_cycles - start_cycles) < byte_cycles)
			byte_cycles = end_cycles - start_cycles;
	}

	kernel_vector_end();

	/* Don't divide by zero. */
	if (!word_cycles || !byte_cycles) {
		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
			cpu);

		goto free;
	}

	if (word_cycles < byte_cycles)
		speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;

	ratio = div_u64((byte_cycles * 100), word_cycles);
	pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
		cpu,
		ratio / 100,
		ratio % 100,
		(speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");

	per_cpu(vector_misaligned_access, cpu) = speed;

free:
	__free_pages(page, MISALIGNED_BUFFER_ORDER);
}

/* Measure unaligned access speed on all CPUs present at boot in parallel. */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);
	riscv_hwprobe_complete_async_probe();

	return 0;
}
#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
#endif

static int riscv_online_cpu_vec(unsigned int cpu)
{
	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
		return 0;
	}

#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
		return 0;

	check_vector_unaligned_access_emulated(NULL);
	check_vector_unaligned_access(NULL);
#endif

	return 0;
}

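/*
 * Indexed by the RISCV_HWPROBE_MISALIGNED_* values used below; the first two
 * slots are NULL because the "unknown" and emulated states cannot be
 * requested from the command line.
 */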
static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };

static int __init set_unaligned_scalar_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);

static int __init set_unaligned_vector_speed_param(char *str)
{
	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
	else
		return -EINVAL;

	return 1;
}
__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);

static int __init check_unaligned_access_all_cpus(void)
{
	int cpu;

	unaligned_access_init();

	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
		for_each_online_cpu(cpu)
			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
	} else if (!check_unaligned_access_emulated_all_cpus()) {
		check_unaligned_access_speed_all_cpus();
	}

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		if (!has_vector() &&
		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
				speed_str[unaligned_vector_speed_param]);
		} else {
			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
		}
	}

	if (!has_vector())
		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
		for_each_online_cpu(cpu)
			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		riscv_hwprobe_register_async_probe();
		if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
				       NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
			pr_warn("Failed to create vec_unalign_check kthread\n");
			riscv_hwprobe_complete_async_probe();
		}
	}

	/*
	 * Set up hotplug callbacks for any new CPUs that come online or go
	 * offline.
	 */
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu, riscv_offline_cpu);
	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
				  riscv_online_cpu_vec, NULL);

	return 0;
}

arch_initcall(check_unaligned_access_all_cpus);