xref: /linux/arch/riscv/kernel/unaligned_access_speed.c (revision d585018a9258efed01514bae369ab3d4f21e7b1a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2024 Rivos Inc.
4  */
5 
6 #include <linux/cpu.h>
7 #include <linux/cpumask.h>
8 #include <linux/jump_label.h>
9 #include <linux/kthread.h>
10 #include <linux/mm.h>
11 #include <linux/smp.h>
12 #include <linux/types.h>
13 #include <asm/cpufeature.h>
14 #include <asm/hwprobe.h>
15 #include <asm/vector.h>
16 
17 #include "copy-unaligned.h"
18 
/* Time budget, in nanoseconds, for each repeated-copy measurement loop. */
#define MISALIGNED_ACCESS_NS 8000000
/* Size in bytes of the scratch buffer allocated for a measurement. */
#define MISALIGNED_BUFFER_SIZE 0x4000
/* Page allocation order covering MISALIGNED_BUFFER_SIZE bytes. */
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
/*
 * Bytes copied per measured call: half the buffer minus 0x80 bytes, which
 * leaves room for the deliberately misaligned dst/src offsets.
 */
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
23 
/* Per-CPU scalar misaligned access result; starts out as "unknown". */
DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
/* Per-CPU vector misaligned access result; starts out as "unsupported". */
DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;

/*
 * Values selected with the "unaligned_scalar_speed=" and
 * "unaligned_vector_speed=" boot parameters. "Unknown" means no override was
 * given on the command line.
 */
static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;
29 
30 static u64 __maybe_unused
31 measure_cycles(void (*func)(void *dst, const void *src, size_t len),
32 	       void *dst, void *src, size_t len)
33 {
34 	u64 start_cycles, end_cycles, cycles = -1ULL;
35 	u64 start_ns;
36 
37 	/* Do a warmup. */
38 	func(dst, src, len);
39 
40 	preempt_disable();
41 
42 	/*
43 	 * For a fixed amount of time, repeatedly try the function, and take
44 	 * the best time in cycles as the measurement.
45 	 */
46 	start_ns = ktime_get_mono_fast_ns();
47 	while (ktime_get_mono_fast_ns() < start_ns + MISALIGNED_ACCESS_NS) {
48 		start_cycles = get_cycles64();
49 		/* Ensure the CSR read can't reorder WRT to the copy. */
50 		mb();
51 		func(dst, src, len);
52 		/* Ensure the copy ends before the end time is snapped. */
53 		mb();
54 		end_cycles = get_cycles64();
55 		if ((end_cycles - start_cycles) < cycles)
56 			cycles = end_cycles - start_cycles;
57 	}
58 
59 	preempt_enable();
60 
61 	return cycles;
62 }
63 
64 /*
65  * Return:
66  *     1 if unaligned accesses are fast
67  *     0 if unaligned accesses are slow
68  *    -1 if check cannot be done
69  */
70 static int __maybe_unused
71 compare_unaligned_access(void (*word_copy)(void *dst, const void *src, size_t len),
72 			 void (*byte_copy)(void *dst, const void *src, size_t len),
73 			 void *buf, const char *type)
74 {
75 	int cpu = smp_processor_id();
76 	u64 word_cycles;
77 	u64 byte_cycles;
78 	void *dst, *src;
79 	bool fast;
80 	int ratio;
81 
82 	/* Make an unaligned destination buffer. */
83 	dst = (void *)((unsigned long)buf | 0x1);
84 	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
85 	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
86 	src += 2;
87 
88 	word_cycles = measure_cycles(word_copy, dst, src, MISALIGNED_COPY_SIZE);
89 	byte_cycles = measure_cycles(byte_copy, dst, src, MISALIGNED_COPY_SIZE);
90 
91 	/* Don't divide by zero. */
92 	if (!word_cycles || !byte_cycles) {
93 		pr_warn("cpu%d: rdtime lacks granularity needed to measure %s unaligned access speed\n",
94 			cpu, type);
95 
96 		return -1;
97 	}
98 
99 	fast = word_cycles < byte_cycles;
100 
101 	ratio = div_u64((byte_cycles * 100), word_cycles);
102 	pr_info("cpu%d: %s unaligned word access speed is %d.%02dx byte access speed (%s)\n",
103 		cpu,
104 		type,
105 		ratio / 100,
106 		ratio % 100,
107 		fast ? "fast" : "slow");
108 
109 	return fast;
110 }
111 
112 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
113 static int check_unaligned_access(struct page *page)
114 {
115 	void *buf = page_address(page);
116 	int cpu = smp_processor_id();
117 	int ret;
118 
119 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
120 		return 0;
121 
122 	ret = compare_unaligned_access(__riscv_copy_words_unaligned,
123 				       __riscv_copy_bytes_unaligned,
124 				       buf, "scalar");
125 	if (ret < 0)
126 		return 0;
127 
128 	/*
129 	 * Set the value of fast_misaligned_access of a CPU. These operations
130 	 * are atomic to avoid race conditions.
131 	 */
132 	if (ret)
133 		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
134 	else
135 		per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
136 
137 	return 0;
138 }
139 
140 static void __init _check_unaligned_access(void *param)
141 {
142 	unsigned int cpu = smp_processor_id();
143 	struct page **pages = param;
144 
145 	check_unaligned_access(pages[cpu]);
146 }
147 
148 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
149 static void __init check_unaligned_access_speed_all_cpus(void)
150 {
151 	unsigned int cpu;
152 	unsigned int cpu_count = num_possible_cpus();
153 	struct page **bufs = kzalloc_objs(*bufs, cpu_count);
154 
155 	if (!bufs) {
156 		pr_warn("Allocation failure, not measuring misaligned performance\n");
157 		return;
158 	}
159 
160 	/*
161 	 * Allocate separate buffers for each CPU so there's no fighting over
162 	 * cache lines.
163 	 */
164 	for_each_cpu(cpu, cpu_online_mask) {
165 		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
166 		if (!bufs[cpu]) {
167 			pr_warn("Allocation failure, not measuring misaligned performance\n");
168 			goto out;
169 		}
170 	}
171 
172 	on_each_cpu(_check_unaligned_access, bufs, 1);
173 
174 out:
175 	for_each_cpu(cpu, cpu_online_mask) {
176 		if (bufs[cpu])
177 			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
178 	}
179 
180 	kfree(bufs);
181 }
182 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
/* Stub used when scalar speed probing is compiled out: measures nothing. */
static void __init check_unaligned_access_speed_all_cpus(void)
{
}
186 #endif
187 
/*
 * Static key, initially false. modify_unaligned_access_branches() enables it
 * only when every CPU in the mask it is given reports fast scalar misaligned
 * access, and disables it otherwise.
 */
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
189 
190 static void modify_unaligned_access_branches(const cpumask_t *mask)
191 {
192 	bool fast = true;
193 	int cpu;
194 
195 	for_each_cpu(cpu, mask) {
196 		if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) {
197 			fast = false;
198 			break;
199 		}
200 	}
201 
202 	if (fast)
203 		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
204 	else
205 		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
206 }
207 
208 static int riscv_online_cpu(unsigned int cpu)
209 {
210 	int ret = cpu_online_unaligned_access_init(cpu);
211 
212 	if (ret)
213 		return ret;
214 
215 	/* We are already set since the last check */
216 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
217 		goto exit;
218 	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
219 		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
220 		goto exit;
221 	}
222 
223 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
224 	{
225 		static struct page *buf;
226 
227 		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
228 		if (!buf) {
229 			pr_warn("Allocation failure, not measuring misaligned performance\n");
230 			return -ENOMEM;
231 		}
232 
233 		check_unaligned_access(buf);
234 		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
235 	}
236 #endif
237 
238 exit:
239 	modify_unaligned_access_branches(cpu_online_mask);
240 
241 	return 0;
242 }
243 
244 static int riscv_offline_cpu(unsigned int cpu)
245 {
246 	cpumask_t mask;
247 
248 	cpumask_copy(&mask, cpu_online_mask);
249 	cpumask_clear_cpu(cpu, &mask);
250 
251 	modify_unaligned_access_branches(&mask);
252 
253 	return 0;
254 }
255 
256 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
257 static void check_vector_unaligned_access(struct work_struct *work __always_unused)
258 {
259 	int cpu = smp_processor_id();
260 	struct page *page;
261 	int ret;
262 
263 	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
264 		return;
265 
266 	page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
267 	if (!page) {
268 		pr_warn("Allocation failure, not measuring vector misaligned performance\n");
269 		return;
270 	}
271 
272 	kernel_vector_begin();
273 
274 	ret = compare_unaligned_access(__riscv_copy_vec_words_unaligned,
275 				       __riscv_copy_vec_bytes_unaligned,
276 				       page_address(page), "vector");
277 	kernel_vector_end();
278 
279 	if (ret < 0)
280 		goto free;
281 
282 	if (ret)
283 		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
284 	else
285 		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
286 
287 free:
288 	__free_pages(page, MISALIGNED_BUFFER_ORDER);
289 }
290 
/*
 * Measure unaligned access speed on all CPUs present at boot in parallel.
 *
 * Thread function handed to kthread_run(); the argument is unused. It queues
 * check_vector_unaligned_access() on each CPU via schedule_on_each_cpu() and
 * then calls riscv_hwprobe_complete_async_probe().
 *
 * Return: always 0.
 */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	schedule_on_each_cpu(check_vector_unaligned_access);
	/* NOTE(review): presumably signals hwprobe that results are ready - confirm. */
	riscv_hwprobe_complete_async_probe();

	return 0;
}
299 #else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
/*
 * Stub used when vector speed probing is compiled out: measures nothing and
 * always returns 0.
 */
static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
{
	return 0;
}
304 #endif
305 
306 static int riscv_online_cpu_vec(unsigned int cpu)
307 {
308 	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
309 		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
310 		return 0;
311 	}
312 
313 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
314 	if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
315 		return 0;
316 
317 	check_vector_unaligned_access_emulated(NULL);
318 	check_vector_unaligned_access(NULL);
319 #endif
320 
321 	return 0;
322 }
323 
/*
 * Command-line spellings of the speed settings, indexed by the
 * RISCV_HWPROBE_MISALIGNED_* value each one selects. The first two slots are
 * NULL.
 */
static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };
325 
326 static int __init set_unaligned_scalar_speed_param(char *str)
327 {
328 	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
329 		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
330 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
331 		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
332 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
333 		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
334 	else
335 		return -EINVAL;
336 
337 	return 1;
338 }
339 __setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);
340 
341 static int __init set_unaligned_vector_speed_param(char *str)
342 {
343 	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
344 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
345 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
346 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
347 	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
348 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
349 	else
350 		return -EINVAL;
351 
352 	return 1;
353 }
354 __setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);
355 
356 static int __init check_unaligned_access_all_cpus(void)
357 {
358 	int cpu;
359 
360 	unaligned_access_init();
361 
362 	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
363 		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
364 			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
365 		for_each_online_cpu(cpu)
366 			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
367 	} else if (!check_unaligned_access_emulated_all_cpus()) {
368 		check_unaligned_access_speed_all_cpus();
369 	}
370 
371 	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
372 		if (!has_vector() &&
373 		    unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) {
374 			pr_warn("vector support is not available, ignoring unaligned_vector_speed=%s\n",
375 				speed_str[unaligned_vector_speed_param]);
376 		} else {
377 			pr_info("vector unaligned access speed set to '%s' (%lu) by command line\n",
378 				speed_str[unaligned_vector_speed_param], unaligned_vector_speed_param);
379 		}
380 	}
381 
382 	if (!has_vector())
383 		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
384 
385 	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
386 		for_each_online_cpu(cpu)
387 			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
388 	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
389 		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
390 		riscv_hwprobe_register_async_probe();
391 		if (IS_ERR(kthread_run(vec_check_unaligned_access_speed_all_cpus,
392 				       NULL, "vec_check_unaligned_access_speed_all_cpus"))) {
393 			pr_warn("Failed to create vec_unalign_check kthread\n");
394 			riscv_hwprobe_complete_async_probe();
395 		}
396 	}
397 
398 	/*
399 	 * Setup hotplug callbacks for any new CPUs that come online or go
400 	 * offline.
401 	 */
402 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
403 				  riscv_online_cpu, riscv_offline_cpu);
404 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
405 				  riscv_online_cpu_vec, NULL);
406 
407 	cpus_read_lock();
408 	modify_unaligned_access_branches(cpu_online_mask);
409 	cpus_read_unlock();
410 
411 	return 0;
412 }
413 
414 late_initcall(check_unaligned_access_all_cpus);
415