xref: /linux/arch/riscv/kernel/unaligned_access_speed.c (revision 15b4155138505669d3d43d7692459ee8ea2a86e7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2024 Rivos Inc.
4  */
5 
6 #include <linux/cpu.h>
7 #include <linux/cpumask.h>
8 #include <linux/jump_label.h>
9 #include <linux/kthread.h>
10 #include <linux/mm.h>
11 #include <linux/smp.h>
12 #include <linux/types.h>
13 #include <asm/cpufeature.h>
14 #include <asm/hwprobe.h>
15 #include <asm/vector.h>
16 
17 #include "copy-unaligned.h"
18 
/* Length of each timing window in measure_cycles(), in nanoseconds (8 ms). */
#define MISALIGNED_ACCESS_NS 8000000
/* Size in bytes of the scratch buffer used for the copy measurements. */
#define MISALIGNED_BUFFER_SIZE 0x4000
/* Page allocation order used when allocating/freeing that scratch buffer. */
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
/*
 * Bytes copied per measurement: slightly less than half the buffer, because
 * the source region starts half a buffer (plus a small misalignment) after
 * the destination region inside the same allocation.
 */
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)

/* Per-CPU scalar verdict, a RISCV_HWPROBE_MISALIGNED_SCALAR_* value. */
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
/* Per-CPU vector verdict, a RISCV_HWPROBE_MISALIGNED_VECTOR_* value. */
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

/*
 * Overrides taken from the unaligned_scalar_speed= / unaligned_vector_speed=
 * boot parameters; *_UNKNOWN means no override was given.
 */
static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;

/* CPUs whose scalar misaligned accesses were measured as fast. */
static cpumask_t fast_misaligned_access;
31 
32 static u64 __maybe_unused
33 measure_cycles(void (*func)(void *dst, const void *src, size_t len),
34 	       void *dst, void *src, size_t len)
35 {
36 	u64 start_cycles, end_cycles, cycles = -1ULL;
37 	u64 start_ns;
38 
39 	/* Do a warmup. */
40 	func(dst, src, len);
41 
42 	preempt_disable();
43 
44 	/*
45 	 * For a fixed amount of time, repeatedly try the function, and take
46 	 * the best time in cycles as the measurement.
47 	 */
48 	start_ns = ktime_get_mono_fast_ns();
49 	while (ktime_get_mono_fast_ns() < start_ns + MISALIGNED_ACCESS_NS) {
50 		start_cycles = get_cycles64();
51 		/* Ensure the CSR read can't reorder WRT to the copy. */
52 		mb();
53 		func(dst, src, len);
54 		/* Ensure the copy ends before the end time is snapped. */
55 		mb();
56 		end_cycles = get_cycles64();
57 		if ((end_cycles - start_cycles) < cycles)
58 			cycles = end_cycles - start_cycles;
59 	}
60 
61 	preempt_enable();
62 
63 	return cycles;
64 }
65 
66 /*
67  * Return:
68  *     1 if unaligned accesses are fast
69  *     0 if unaligned accesses are slow
70  *    -1 if check cannot be done
71  */
72 static int __maybe_unused
73 compare_unaligned_access(void (*word_copy)(void *dst, const void *src, size_t len),
74 			 void (*byte_copy)(void *dst, const void *src, size_t len),
75 			 void *buf, const char *type)
76 {
77 	int cpu = smp_processor_id();
78 	u64 word_cycles;
79 	u64 byte_cycles;
80 	void *dst, *src;
81 	bool fast;
82 	int ratio;
83 
84 	/* Make an unaligned destination buffer. */
85 	dst = (void *)((unsigned long)buf | 0x1);
86 	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
87 	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
88 	src += 2;
89 
90 	word_cycles = measure_cycles(word_copy, dst, src, MISALIGNED_COPY_SIZE);
91 	byte_cycles = measure_cycles(byte_copy, dst, src, MISALIGNED_COPY_SIZE);
92 
93 	/* Don't divide by zero. */
94 	if (!word_cycles || !byte_cycles) {
95 		pr_warn("cpu%d: rdtime lacks granularity needed to measure %s unaligned access speed\n",
96 			cpu, type);
97 
98 		return -1;
99 	}
100 
101 	fast = word_cycles < byte_cycles;
102 
103 	ratio = div_u64((byte_cycles * 100), word_cycles);
104 	pr_info("cpu%d: %s unaligned word access speed is %d.%02dx byte access speed (%s)\n",
105 		cpu,
106 		type,
107 		ratio / 100,
108 		ratio % 100,
109 		fast ? "fast" : "slow");
110 
111 	return fast;
112 }
113 
114 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
115 static int check_unaligned_access(struct page *page)
116 {
117 	void *buf = page_address(page);
118 	int cpu = smp_processor_id();
119 	int ret;
120 
121 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
122 		return 0;
123 
124 	ret = compare_unaligned_access(__riscv_copy_words_unaligned,
125 				       __riscv_copy_bytes_unaligned,
126 				       buf, "scalar");
127 	if (ret < 0)
128 		return 0;
129 
130 	/*
131 	 * Set the value of fast_misaligned_access of a CPU. These operations
132 	 * are atomic to avoid race conditions.
133 	 */
134 	if (ret) {
135 		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
136 		cpumask_set_cpu(cpu, &fast_misaligned_access);
137 	} else {
138 		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
139 		cpumask_clear_cpu(cpu, &fast_misaligned_access);
140 	}
141 
142 	return 0;
143 }
144 
145 static void __init _check_unaligned_access(void *param)
146 {
147 	unsigned int cpu = smp_processor_id();
148 	struct page **pages = param;
149 
150 	check_unaligned_access(pages[cpu]);
151 }
152 
153 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
154 static void __init check_unaligned_access_speed_all_cpus(void)
155 {
156 	unsigned int cpu;
157 	unsigned int cpu_count = num_possible_cpus();
158 	struct page **bufs = kzalloc_objs(*bufs, cpu_count);
159 
160 	if (!bufs) {
161 		pr_warn("Allocation failure, not measuring misaligned performance\n");
162 		return;
163 	}
164 
165 	/*
166 	 * Allocate separate buffers for each CPU so there's no fighting over
167 	 * cache lines.
168 	 */
169 	for_each_cpu(cpu, cpu_online_mask) {
170 		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
171 		if (!bufs[cpu]) {
172 			pr_warn("Allocation failure, not measuring misaligned performance\n");
173 			goto out;
174 		}
175 	}
176 
177 	on_each_cpu(_check_unaligned_access, bufs, 1);
178 
179 out:
180 	for_each_cpu(cpu, cpu_online_mask) {
181 		if (bufs[cpu])
182 			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
183 	}
184 
185 	kfree(bufs);
186 }
187 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
/* Scalar speed probing is compiled out: nothing to measure. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
191 #endif
192 
193 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
194 
195 static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
196 {
197 	if (cpumask_weight(mask) == weight)
198 		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
199 	else
200 		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
201 }
202 
203 static void set_unaligned_access_static_branches_except_cpu(int cpu)
204 {
205 	/*
206 	 * Same as set_unaligned_access_static_branches, except excludes the
207 	 * given CPU from the result. When a CPU is hotplugged into an offline
208 	 * state, this function is called before the CPU is set to offline in
209 	 * the cpumask, and thus the CPU needs to be explicitly excluded.
210 	 */
211 
212 	cpumask_t fast_except_me;
213 
214 	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
215 	cpumask_clear_cpu(cpu, &fast_except_me);
216 
217 	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
218 }
219 
220 static void set_unaligned_access_static_branches(void)
221 {
222 	/*
223 	 * This will be called after check_unaligned_access_all_cpus so the
224 	 * result of unaligned access speed for all CPUs will be available.
225 	 *
226 	 * To avoid the number of online cpus changing between reading
227 	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
228 	 * held before calling this function.
229 	 */
230 
231 	cpumask_t fast_and_online;
232 
233 	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
234 
235 	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
236 }
237 
238 static int riscv_online_cpu(unsigned int cpu)
239 {
240 	int ret = cpu_online_unaligned_access_init(cpu);
241 
242 	if (ret)
243 		return ret;
244 
245 	/* We are already set since the last check */
246 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
247 		goto exit;
248 	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
249 		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
250 		goto exit;
251 	}
252 
253 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
254 	{
255 		static struct page *buf;
256 
257 		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
258 		if (!buf) {
259 			pr_warn("Allocation failure, not measuring misaligned performance\n");
260 			return -ENOMEM;
261 		}
262 
263 		check_unaligned_access(buf);
264 		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
265 	}
266 #endif
267 
268 exit:
269 	set_unaligned_access_static_branches();
270 
271 	return 0;
272 }
273 
/*
 * CPU hotplug "offline" callback: re-evaluate the static branch as if @cpu
 * were already gone. Always returns 0.
 */
static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);

	return 0;
}
280 
281 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
282 static void check_vector_unaligned_access(struct work_struct *work __always_unused)
283 {
284 	int cpu = smp_processor_id();
285 	struct page *page;
286 	int ret;
287 
288 	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
289 		return;
290 
291 	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
292 	if (!page) {
293 		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
294 		return;
295 	}
296 
297 	kernel_vector_begin();
298 
299 	ret = compare_unaligned_access(__riscv_copy_vec_words_unaligned,
300 				       __riscv_copy_vec_bytes_unaligned,
301 				       page_address(page), "vector");
302 	kernel_vector_end();
303 
304 	if (ret < 0)
305 		goto free;
306 
307 	if (ret)
308 		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
309 	else
310 		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
311 
312 free:
313 	__free_pages(page, MISALIGNED_BUFFER_ORDER);
314 }
315 
316 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
317 static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
318 {
319 	schedule_on_each_cpu(check_vector_unaligned_access);
320 	riscv_hwprobe_complete_async_probe();
321 
322 	return 0;
323 }
324 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
/* Vector speed probing is compiled out: nothing to measure, report success. */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
329 #endif
330 
331 static int riscv_online_cpu_vec(unsigned int cpu)
332 {
333 	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
334 		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
335 		return 0;
336 	}
337 
338 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
339 	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
340 		return 0;
341 
342 	check_vector_unaligned_access_emulated(NULL);
343 	check_vector_unaligned_access(NULL);
344 #endif
345 
346 	return 0;
347 }
348 
/*
 * Command-line spellings, indexed by RISCV_HWPROBE_MISALIGNED_{SCALAR,VECTOR}_*
 * value. The two leading NULL slots are values that have no spelling here --
 * presumably UNKNOWN and EMULATED; confirm against asm/hwprobe.h.
 */
static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };
350 
351 static int __init set_unaligned_scalar_speed_param(char *str)
352 {
353 	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
354 		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
355 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
356 		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
357 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
358 		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
359 	else
360 		return -EINVAL;
361 
362 	return 1;
363 }
364 __setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);
365 
366 static int __init set_unaligned_vector_speed_param(char *str)
367 {
368 	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
369 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
370 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
371 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
372 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
373 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
374 	else
375 		return -EINVAL;
376 
377 	return 1;
378 }
379 __setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);
380 
381 static int __init check_unaligned_access_all_cpus(void)
382 {
383 	int cpu;
384 
385 	unaligned_access_init();
386 
387 	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
388 		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
389 			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
390 		for_each_online_cpu(cpu)
391 			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
392 	} else if (!check_unaligned_access_emulated_all_cpus()) {
393 		check_unaligned_access_speed_all_cpus();
394 	}
395 
396 	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
397 		if (!has_vector() &&
398 		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
399 			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
400 				speed_str[unaligned_vector_speed_param]);
401 		} else {
402 			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
403 				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
404 		}
405 	}
406 
407 	if (!has_vector())
408 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
409 
410 	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
411 		for_each_online_cpu(cpu)
412 			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
413 	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
414 		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
415 		riscv_hwprobe_register_async_probe();
416 		if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
417 				       NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
418 			pr_warn("Failed to create vec_unalign_check kthread\n");
419 			riscv_hwprobe_complete_async_probe();
420 		}
421 	}
422 
423 	/*
424 	 * Setup hotplug callbacks for any new CPUs that come online or go
425 	 * offline.
426 	 */
427 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
428 				  riscv_online_cpu, riscv_offline_cpu);
429 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
430 				  riscv_online_cpu_vec, NULL);
431 
432 	cpus_read_lock();
433 	set_unaligned_access_static_branches();
434 	cpus_read_unlock();
435 
436 	return 0;
437 }
438 
439 late_initcall(check_unaligned_access_all_cpus);
440