xref: /linux/arch/riscv/kernel/unaligned_access_speed.c (revision 566ab427f827b0256d3e8ce0235d088e6a9c28bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2024 Rivos Inc.
4  */
5 
6 #include <linux/cpu.h>
7 #include <linux/cpumask.h>
8 #include <linux/jump_label.h>
9 #include <linux/mm.h>
10 #include <linux/smp.h>
11 #include <linux/types.h>
12 #include <asm/cpufeature.h>
13 #include <asm/hwprobe.h>
14 
15 #include "copy-unaligned.h"
16 
/* log2 of the number of jiffies each timing loop is allowed to run. */
#define MISALIGNED_ACCESS_JIFFIES_LG2	1
/* Size in bytes of the scratch allocation used for the copy tests. */
#define MISALIGNED_BUFFER_SIZE		0x4000
/* Page-allocator order covering MISALIGNED_BUFFER_SIZE bytes. */
#define MISALIGNED_BUFFER_ORDER		get_order(MISALIGNED_BUFFER_SIZE)
/* Bytes moved per test copy: half of the buffer, less 0x80 bytes of slack. */
#define MISALIGNED_COPY_SIZE		((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
21 
22 DEFINE_PER_CPU(long, misaligned_access_speed);
23 
24 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
25 static cpumask_t fast_misaligned_access;
26 static int check_unaligned_access(void *param)
27 {
28 	int cpu = smp_processor_id();
29 	u64 start_cycles, end_cycles;
30 	u64 word_cycles;
31 	u64 byte_cycles;
32 	int ratio;
33 	unsigned long start_jiffies, now;
34 	struct page *page = param;
35 	void *dst;
36 	void *src;
37 	long speed = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
38 
39 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
40 		return 0;
41 
42 	/* Make an unaligned destination buffer. */
43 	dst = (void *)((unsigned long)page_address(page) | 0x1);
44 	/* Unalign src as well, but differently (off by 1 + 2 = 3). */
45 	src = dst + (MISALIGNED_BUFFER_SIZE / 2);
46 	src += 2;
47 	word_cycles = -1ULL;
48 	/* Do a warmup. */
49 	__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
50 	preempt_disable();
51 	start_jiffies = jiffies;
52 	while ((now = jiffies) == start_jiffies)
53 		cpu_relax();
54 
55 	/*
56 	 * For a fixed amount of time, repeatedly try the function, and take
57 	 * the best time in cycles as the measurement.
58 	 */
59 	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
60 		start_cycles = get_cycles64();
61 		/* Ensure the CSR read can't reorder WRT to the copy. */
62 		mb();
63 		__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
64 		/* Ensure the copy ends before the end time is snapped. */
65 		mb();
66 		end_cycles = get_cycles64();
67 		if ((end_cycles - start_cycles) < word_cycles)
68 			word_cycles = end_cycles - start_cycles;
69 	}
70 
71 	byte_cycles = -1ULL;
72 	__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
73 	start_jiffies = jiffies;
74 	while ((now = jiffies) == start_jiffies)
75 		cpu_relax();
76 
77 	while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
78 		start_cycles = get_cycles64();
79 		mb();
80 		__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
81 		mb();
82 		end_cycles = get_cycles64();
83 		if ((end_cycles - start_cycles) < byte_cycles)
84 			byte_cycles = end_cycles - start_cycles;
85 	}
86 
87 	preempt_enable();
88 
89 	/* Don't divide by zero. */
90 	if (!word_cycles || !byte_cycles) {
91 		pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
92 			cpu);
93 
94 		return 0;
95 	}
96 
97 	if (word_cycles < byte_cycles)
98 		speed = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
99 
100 	ratio = div_u64((byte_cycles * 100), word_cycles);
101 	pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
102 		cpu,
103 		ratio / 100,
104 		ratio % 100,
105 		(speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) ? "fast" : "slow");
106 
107 	per_cpu(misaligned_access_speed, cpu) = speed;
108 
109 	/*
110 	 * Set the value of fast_misaligned_access of a CPU. These operations
111 	 * are atomic to avoid race conditions.
112 	 */
113 	if (speed == RISCV_HWPROBE_MISALIGNED_SCALAR_FAST)
114 		cpumask_set_cpu(cpu, &fast_misaligned_access);
115 	else
116 		cpumask_clear_cpu(cpu, &fast_misaligned_access);
117 
118 	return 0;
119 }
120 
/*
 * Cross-call handler: measure the calling CPU with its own entry from the
 * buffer array, unless the calling CPU is CPU 0.
 *
 * @param: array of struct page pointers, indexed by CPU id.
 */
static void check_unaligned_access_nonboot_cpu(void *param)
{
	struct page **bufs = param;
	unsigned int this_cpu = smp_processor_id();

	/* CPU 0 is deliberately left out here. */
	if (this_cpu == 0)
		return;

	check_unaligned_access(bufs[this_cpu]);
}
129 
130 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
131 
132 static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
133 {
134 	if (cpumask_weight(mask) == weight)
135 		static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
136 	else
137 		static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
138 }
139 
140 static void set_unaligned_access_static_branches_except_cpu(int cpu)
141 {
142 	/*
143 	 * Same as set_unaligned_access_static_branches, except excludes the
144 	 * given CPU from the result. When a CPU is hotplugged into an offline
145 	 * state, this function is called before the CPU is set to offline in
146 	 * the cpumask, and thus the CPU needs to be explicitly excluded.
147 	 */
148 
149 	cpumask_t fast_except_me;
150 
151 	cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
152 	cpumask_clear_cpu(cpu, &fast_except_me);
153 
154 	modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
155 }
156 
157 static void set_unaligned_access_static_branches(void)
158 {
159 	/*
160 	 * This will be called after check_unaligned_access_all_cpus so the
161 	 * result of unaligned access speed for all CPUs will be available.
162 	 *
163 	 * To avoid the number of online cpus changing between reading
164 	 * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
165 	 * held before calling this function.
166 	 */
167 
168 	cpumask_t fast_and_online;
169 
170 	cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
171 
172 	modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
173 }
174 
/*
 * Initcall wrapper: update the unaligned access static branch while holding
 * the CPU hotplug read lock.
 *
 * Return: always 0.
 */
static int lock_and_set_unaligned_access_static_branch(void)
{
	/* Keep the online CPU set stable for the duration of the update. */
	cpus_read_lock();
	set_unaligned_access_static_branches();
	cpus_read_unlock();
	return 0;
}
183 
184 arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
185 
186 static int riscv_online_cpu(unsigned int cpu)
187 {
188 	static struct page *buf;
189 
190 	/* We are already set since the last check */
191 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
192 		goto exit;
193 
194 	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
195 	if (!buf) {
196 		pr_warn("Allocation failure, not measuring misaligned performance\n");
197 		return -ENOMEM;
198 	}
199 
200 	check_unaligned_access(buf);
201 	__free_pages(buf, MISALIGNED_BUFFER_ORDER);
202 
203 exit:
204 	set_unaligned_access_static_branches();
205 
206 	return 0;
207 }
208 
/*
 * CPU hotplug "offline" callback: update the static branch as if @cpu were
 * already gone.
 *
 * Return: always 0.
 */
static int riscv_offline_cpu(unsigned int cpu)
{
	set_unaligned_access_static_branches_except_cpu(cpu);
	return 0;
}
215 
216 /* Measure unaligned access speed on all CPUs present at boot in parallel. */
217 static int check_unaligned_access_speed_all_cpus(void)
218 {
219 	unsigned int cpu;
220 	unsigned int cpu_count = num_possible_cpus();
221 	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
222 
223 	if (!bufs) {
224 		pr_warn("Allocation failure, not measuring misaligned performance\n");
225 		return 0;
226 	}
227 
228 	/*
229 	 * Allocate separate buffers for each CPU so there's no fighting over
230 	 * cache lines.
231 	 */
232 	for_each_cpu(cpu, cpu_online_mask) {
233 		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
234 		if (!bufs[cpu]) {
235 			pr_warn("Allocation failure, not measuring misaligned performance\n");
236 			goto out;
237 		}
238 	}
239 
240 	/* Check everybody except 0, who stays behind to tend jiffies. */
241 	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
242 
243 	/* Check core 0. */
244 	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
245 
246 	/*
247 	 * Setup hotplug callbacks for any new CPUs that come online or go
248 	 * offline.
249 	 */
250 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
251 				  riscv_online_cpu, riscv_offline_cpu);
252 
253 out:
254 	for_each_cpu(cpu, cpu_online_mask) {
255 		if (bufs[cpu])
256 			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
257 	}
258 
259 	kfree(bufs);
260 	return 0;
261 }
262 
/*
 * Initcall entry point when speed probing is configured in: run the emulation
 * check first, and only run the speed measurement when that check does not
 * report that all CPUs are emulated.
 *
 * Return: 0, or whatever check_unaligned_access_speed_all_cpus() returns.
 */
static int check_unaligned_access_all_cpus(void)
{
	/* The speed measurement is skipped when every CPU is emulated. */
	if (check_unaligned_access_emulated_all_cpus())
		return 0;

	return check_unaligned_access_speed_all_cpus();
}
272 #else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
/*
 * Initcall entry point when speed probing is configured out: only the
 * emulation check is run and its result is ignored.
 *
 * Return: always 0.
 */
static int check_unaligned_access_all_cpus(void)
{
	(void)check_unaligned_access_emulated_all_cpus();
	return 0;
}
279 #endif
280 
281 arch_initcall(check_unaligned_access_all_cpus);
282