xref: /linux/tools/testing/selftests/rseq/rseq.c (revision 7f0023215262221ca08d56be2203e8a4770be033)
1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3  * rseq.c
4  *
5  * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; only
10  * version 2.1 of the License.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  */
17 
18 #define _GNU_SOURCE
19 #include <errno.h>
20 #include <sched.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <syscall.h>
26 #include <assert.h>
27 #include <signal.h>
28 #include <limits.h>
29 #include <dlfcn.h>
30 #include <stddef.h>
31 #include <sys/auxv.h>
32 #include <linux/auxvec.h>
33 
34 #include <linux/compiler.h>
35 
36 #include "kselftest.h"
37 #include "rseq.h"
38 
39 /*
40  * Define weak versions to play nice with binaries that are statically linked
41  * against a libc that doesn't support registering its own rseq.
42  */
43 extern __weak ptrdiff_t __rseq_offset;
44 extern __weak unsigned int __rseq_size;
45 extern __weak unsigned int __rseq_flags;
46 
47 static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
48 static const unsigned int *libc_rseq_size_p = &__rseq_size;
49 static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
50 
/* Offset of the rseq area relative to the thread pointer. */
ptrdiff_t rseq_offset;

/*
 * Size of the registered rseq area, or 0 if the registration was
 * unsuccessful. Holds -1U until rseq_init() has assigned a value.
 */
unsigned int rseq_size = -1U;

/* Length passed to the rseq system call by the registration path. */
static unsigned int rseq_alloc_size;

/* Flags used during rseq registration. */
unsigned int rseq_flags;

/* Set to 1 by rseq_init() when this file owns the rseq registration. */
static int rseq_ownership;
65 
/* Size, in bytes, of the (deliberately large) per-thread rseq TLS area. */
#define RSEQ_THREAD_AREA_ALLOC_SIZE	1024

/* Feature size of the original struct rseq: 20 bytes. */
#define ORIG_RSEQ_FEATURE_SIZE		20

/* Allocation size of the original struct rseq: 32 bytes. */
#define ORIG_RSEQ_ALLOC_SIZE		32
74 
75 /*
76  * Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an
77  * rseq registration that is larger than the current rseq ABI.
78  */
79 union rseq_tls {
80 	struct rseq_abi abi;
81 	char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
82 };
83 
84 static
85 __thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
86 	.abi = {
87 		.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
88 	},
89 };
90 
/*
 * Thin wrapper around the rseq system call.
 *
 * All four arguments are forwarded unchanged to syscall(__NR_rseq, ...).
 * Returns whatever syscall() returned, narrowed to int.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);

	return (int)ret;
}
96 
/*
 * Thin wrapper around the getcpu system call.
 *
 * @cpu and @node are forwarded to syscall(__NR_getcpu, ...); the third
 * system call argument is always NULL. Returns whatever syscall() returned,
 * narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret = syscall(__NR_getcpu, cpu, node, NULL);

	return (int)ret;
}
101 
/*
 * Probe for the rseq system call by issuing it with all-zero arguments.
 *
 * The probe call is expected to fail with -1. errno then decides the answer:
 * ENOSYS yields false, EINVAL yields true. A return value other than -1, or
 * any other errno, aborts the process.
 */
bool rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return false;
	if (errno == EINVAL)
		return true;

	abort();
}
118 
119 /*
120  * Return the feature size supported by the kernel.
121  *
122  * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE):
123  *
124  *   0: Return ORIG_RSEQ_FEATURE_SIZE (20)
125  * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE).
126  *
127  * It should never return a value below ORIG_RSEQ_FEATURE_SIZE.
128  */
get_rseq_kernel_feature_size(void)129 static unsigned int get_rseq_kernel_feature_size(void)
130 {
131 	unsigned long auxv_rseq_feature_size, auxv_rseq_align;
132 
133 	auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
134 	assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
135 
136 	auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
137 	assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
138 	if (auxv_rseq_feature_size)
139 		return auxv_rseq_feature_size;
140 	else
141 		return ORIG_RSEQ_FEATURE_SIZE;
142 }
143 
__rseq_register_current_thread(bool nolibc,bool legacy)144 int __rseq_register_current_thread(bool nolibc, bool legacy)
145 {
146 	unsigned int size;
147 	int rc;
148 
149 	if (!rseq_ownership) {
150 		/* Treat libc's ownership as a successful registration. */
151 		return nolibc ? -EBUSY : 0;
152 	}
153 
154 	/* The minimal allocation size is 32, which is the legacy allocation size */
155 	size = get_rseq_kernel_feature_size();
156 	if (legacy || size < ORIG_RSEQ_ALLOC_SIZE)
157 		rseq_alloc_size = ORIG_RSEQ_ALLOC_SIZE;
158 	else
159 		rseq_alloc_size = size;
160 
161 	rc = sys_rseq(&__rseq.abi, rseq_alloc_size, 0, RSEQ_SIG);
162 	if (rc) {
163 		/*
164 		 * After at least one thread has registered successfully
165 		 * (rseq_size > 0), the registration of other threads should
166 		 * never fail.
167 		 */
168 		if (RSEQ_READ_ONCE(rseq_size) > 0) {
169 			/* Incoherent success/failure within process. */
170 			abort();
171 		}
172 		return -1;
173 	}
174 	assert(rseq_current_cpu_raw() >= 0);
175 
176 	/*
177 	 * The first thread to register sets the rseq_size to mimic the libc
178 	 * behavior.
179 	 */
180 	if (RSEQ_READ_ONCE(rseq_size) == 0)
181 		RSEQ_WRITE_ONCE(rseq_size, size);
182 
183 	return 0;
184 }
185 
rseq_unregister_current_thread(void)186 int rseq_unregister_current_thread(void)
187 {
188 	int rc;
189 
190 	if (!rseq_ownership) {
191 		/* Treat libc's ownership as a successful unregistration. */
192 		return 0;
193 	}
194 	rc = sys_rseq(&__rseq.abi, rseq_alloc_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
195 	if (rc)
196 		return -1;
197 	return 0;
198 }
199 
200 static __attribute__((constructor))
rseq_init(void)201 void rseq_init(void)
202 {
203 	/*
204 	 * If the libc's registered rseq size isn't already valid, it may be
205 	 * because the binary is dynamically linked and not necessarily due to
206 	 * libc not having registered a restartable sequence.  Try to find the
207 	 * symbols if that's the case.
208 	 */
209 	if (!libc_rseq_size_p || !*libc_rseq_size_p) {
210 		libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
211 		libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
212 		libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
213 	}
214 	if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
215 			*libc_rseq_size_p != 0) {
216 		unsigned int libc_rseq_size;
217 
218 		/* rseq registration owned by glibc */
219 		rseq_offset = *libc_rseq_offset_p;
220 		libc_rseq_size = *libc_rseq_size_p;
221 		rseq_flags = *libc_rseq_flags_p;
222 
223 		/*
224 		 * Previous versions of glibc expose the value
225 		 * 32 even though the kernel only supported 20
226 		 * bytes initially. Therefore treat 32 as a
227 		 * special-case. glibc 2.40 exposes a 20 bytes
228 		 * __rseq_size without using getauxval(3) to
229 		 * query the supported size, while still allocating a 32
230 		 * bytes area. Also treat 20 as a special-case.
231 		 *
232 		 * Special-cases are handled by using the following
233 		 * value as active feature set size:
234 		 *
235 		 *   rseq_size = min(32, get_rseq_kernel_feature_size())
236 		 */
237 		switch (libc_rseq_size) {
238 		case ORIG_RSEQ_FEATURE_SIZE:
239 			fallthrough;
240 		case ORIG_RSEQ_ALLOC_SIZE:
241 		{
242 			unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();
243 
244 			if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
245 				rseq_size = rseq_kernel_feature_size;
246 			else
247 				rseq_size = ORIG_RSEQ_ALLOC_SIZE;
248 			break;
249 		}
250 		default:
251 			/* Otherwise just use the __rseq_size from libc as rseq_size. */
252 			rseq_size = libc_rseq_size;
253 			break;
254 		}
255 		return;
256 	}
257 	rseq_ownership = 1;
258 
259 	/* Calculate the offset of the rseq area from the thread pointer. */
260 	rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer();
261 
262 	/* rseq flags are deprecated, always set to 0. */
263 	rseq_flags = 0;
264 
265 	/*
266 	 * Set the size to 0 until at least one thread registers to mimic the
267 	 * libc behavior.
268 	 */
269 	rseq_size = 0;
270 }
271 
272 static __attribute__((destructor))
rseq_exit(void)273 void rseq_exit(void)
274 {
275 	if (!rseq_ownership)
276 		return;
277 	rseq_offset = 0;
278 	rseq_size = -1U;
279 	rseq_ownership = 0;
280 }
281 
/*
 * Return the current CPU number as reported by sched_getcpu().
 *
 * A negative result from sched_getcpu() is reported with perror() and the
 * process is aborted, so this function only ever returns a value >= 0.
 */
int32_t rseq_fallback_current_cpu(void)
{
	int32_t cpu_nr = sched_getcpu();

	if (cpu_nr >= 0)
		return cpu_nr;

	perror("sched_getcpu()");
	abort();
}
293 
/*
 * Return the node id reported by the getcpu system call.
 *
 * When sys_getcpu() returns non-zero, the failure is reported with perror()
 * and that return value is handed back to the caller instead of a node id.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t cpu, node;
	int rc = sys_getcpu(&cpu, &node);

	if (!rc)
		return (int32_t) node;

	perror("sys_getcpu()");
	return rc;
}
306