1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3 * rseq.c
4 *
5 * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; only
10 * version 2.1 of the License.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 */
17
18 #define _GNU_SOURCE
19 #include <errno.h>
20 #include <sched.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <syscall.h>
26 #include <assert.h>
27 #include <signal.h>
28 #include <limits.h>
29 #include <dlfcn.h>
30 #include <stddef.h>
31 #include <sys/auxv.h>
32 #include <linux/auxvec.h>
33
34 #include <linux/compiler.h>
35
36 #include "kselftest.h"
37 #include "rseq.h"
38
39 /*
40 * Define weak versions to play nice with binaries that are statically linked
41 * against a libc that doesn't support registering its own rseq.
42 */
43 extern __weak ptrdiff_t __rseq_offset;
44 extern __weak unsigned int __rseq_size;
45 extern __weak unsigned int __rseq_flags;
46
47 static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
48 static const unsigned int *libc_rseq_size_p = &__rseq_size;
49 static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
50
/* Distance from the thread pointer to the rseq area in use. */
ptrdiff_t rseq_offset;

/*
 * Size of the active rseq feature set. Starts out as -1U; rseq_init() either
 * derives it from libc's __rseq_size or sets it to 0 when this file owns the
 * registration, in which case it stays 0 until a thread registers
 * successfully. rseq_exit() resets it to -1U when this file owned the
 * registration.
 */
unsigned int rseq_size = -1U;

/* Length handed to the rseq syscall when this file (un)registers a thread. */
static unsigned int rseq_alloc_size;

/* Registration flags: copied from libc, or 0 when this file registers. */
unsigned int rseq_flags;

/* Non-zero when this file, rather than libc, owns the rseq registration. */
static int rseq_ownership;
65
/* Bytes reserved per thread for the TLS rseq area (deliberately large). */
#define RSEQ_THREAD_AREA_ALLOC_SIZE	1024

/* Feature size of the original struct rseq: 20 bytes. */
#define ORIG_RSEQ_FEATURE_SIZE		20

/* Allocation size of the original struct rseq: 32 bytes. */
#define ORIG_RSEQ_ALLOC_SIZE		32
74
75 /*
76 * Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an
77 * rseq registration that is larger than the current rseq ABI.
78 */
79 union rseq_tls {
80 struct rseq_abi abi;
81 char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
82 };
83
84 static
85 __thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
86 .abi = {
87 .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
88 },
89 };
90
/*
 * Thin wrapper around the raw rseq system call.
 *
 * Forwards all four arguments unchanged and returns whatever syscall()
 * returns, narrowed to int.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);

	return ret;
}
96
/*
 * Thin wrapper around the raw getcpu system call.
 *
 * @cpu and @node are passed through as the first two arguments; the third
 * argument is always NULL. Returns the syscall() result narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret = syscall(__NR_getcpu, cpu, node, NULL);

	return ret;
}
101
/*
 * Probe whether the rseq system call exists.
 *
 * Issues a deliberately invalid call (NULL area, zero length). The call is
 * expected to fail: ENOSYS yields false and EINVAL yields true. A call that
 * does not return -1, or that fails with any other errno, aborts the process.
 */
bool rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return false;
	if (errno == EINVAL)
		return true;

	abort();
}
118
119 /*
120 * Return the feature size supported by the kernel.
121 *
122 * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE):
123 *
124 * 0: Return ORIG_RSEQ_FEATURE_SIZE (20)
125 * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE).
126 *
127 * It should never return a value below ORIG_RSEQ_FEATURE_SIZE.
128 */
get_rseq_kernel_feature_size(void)129 static unsigned int get_rseq_kernel_feature_size(void)
130 {
131 unsigned long auxv_rseq_feature_size, auxv_rseq_align;
132
133 auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
134 assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
135
136 auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
137 assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
138 if (auxv_rseq_feature_size)
139 return auxv_rseq_feature_size;
140 else
141 return ORIG_RSEQ_FEATURE_SIZE;
142 }
143
__rseq_register_current_thread(bool nolibc,bool legacy)144 int __rseq_register_current_thread(bool nolibc, bool legacy)
145 {
146 unsigned int size;
147 int rc;
148
149 if (!rseq_ownership) {
150 /* Treat libc's ownership as a successful registration. */
151 return nolibc ? -EBUSY : 0;
152 }
153
154 /* The minimal allocation size is 32, which is the legacy allocation size */
155 size = get_rseq_kernel_feature_size();
156 if (legacy || size < ORIG_RSEQ_ALLOC_SIZE)
157 rseq_alloc_size = ORIG_RSEQ_ALLOC_SIZE;
158 else
159 rseq_alloc_size = size;
160
161 rc = sys_rseq(&__rseq.abi, rseq_alloc_size, 0, RSEQ_SIG);
162 if (rc) {
163 /*
164 * After at least one thread has registered successfully
165 * (rseq_size > 0), the registration of other threads should
166 * never fail.
167 */
168 if (RSEQ_READ_ONCE(rseq_size) > 0) {
169 /* Incoherent success/failure within process. */
170 abort();
171 }
172 return -1;
173 }
174 assert(rseq_current_cpu_raw() >= 0);
175
176 /*
177 * The first thread to register sets the rseq_size to mimic the libc
178 * behavior.
179 */
180 if (RSEQ_READ_ONCE(rseq_size) == 0)
181 RSEQ_WRITE_ONCE(rseq_size, size);
182
183 return 0;
184 }
185
rseq_unregister_current_thread(void)186 int rseq_unregister_current_thread(void)
187 {
188 int rc;
189
190 if (!rseq_ownership) {
191 /* Treat libc's ownership as a successful unregistration. */
192 return 0;
193 }
194 rc = sys_rseq(&__rseq.abi, rseq_alloc_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
195 if (rc)
196 return -1;
197 return 0;
198 }
199
200 static __attribute__((constructor))
rseq_init(void)201 void rseq_init(void)
202 {
203 /*
204 * If the libc's registered rseq size isn't already valid, it may be
205 * because the binary is dynamically linked and not necessarily due to
206 * libc not having registered a restartable sequence. Try to find the
207 * symbols if that's the case.
208 */
209 if (!libc_rseq_size_p || !*libc_rseq_size_p) {
210 libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
211 libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
212 libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
213 }
214 if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
215 *libc_rseq_size_p != 0) {
216 unsigned int libc_rseq_size;
217
218 /* rseq registration owned by glibc */
219 rseq_offset = *libc_rseq_offset_p;
220 libc_rseq_size = *libc_rseq_size_p;
221 rseq_flags = *libc_rseq_flags_p;
222
223 /*
224 * Previous versions of glibc expose the value
225 * 32 even though the kernel only supported 20
226 * bytes initially. Therefore treat 32 as a
227 * special-case. glibc 2.40 exposes a 20 bytes
228 * __rseq_size without using getauxval(3) to
229 * query the supported size, while still allocating a 32
230 * bytes area. Also treat 20 as a special-case.
231 *
232 * Special-cases are handled by using the following
233 * value as active feature set size:
234 *
235 * rseq_size = min(32, get_rseq_kernel_feature_size())
236 */
237 switch (libc_rseq_size) {
238 case ORIG_RSEQ_FEATURE_SIZE:
239 fallthrough;
240 case ORIG_RSEQ_ALLOC_SIZE:
241 {
242 unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();
243
244 if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
245 rseq_size = rseq_kernel_feature_size;
246 else
247 rseq_size = ORIG_RSEQ_ALLOC_SIZE;
248 break;
249 }
250 default:
251 /* Otherwise just use the __rseq_size from libc as rseq_size. */
252 rseq_size = libc_rseq_size;
253 break;
254 }
255 return;
256 }
257 rseq_ownership = 1;
258
259 /* Calculate the offset of the rseq area from the thread pointer. */
260 rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer();
261
262 /* rseq flags are deprecated, always set to 0. */
263 rseq_flags = 0;
264
265 /*
266 * Set the size to 0 until at least one thread registers to mimic the
267 * libc behavior.
268 */
269 rseq_size = 0;
270 }
271
272 static __attribute__((destructor))
rseq_exit(void)273 void rseq_exit(void)
274 {
275 if (!rseq_ownership)
276 return;
277 rseq_offset = 0;
278 rseq_size = -1U;
279 rseq_ownership = 0;
280 }
281
/*
 * Return the current CPU number as reported by sched_getcpu().
 *
 * If sched_getcpu() reports an error (negative value), print a diagnostic
 * with perror() and abort the process.
 */
int32_t rseq_fallback_current_cpu(void)
{
	int32_t cur = sched_getcpu();

	if (cur >= 0)
		return cur;

	perror("sched_getcpu()");
	abort();
}
293
/*
 * Return the node id reported by the getcpu system call.
 *
 * On failure, print a diagnostic with perror() and return the non-zero
 * result of sys_getcpu() instead of a node id.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t this_cpu, this_node;
	int rc = sys_getcpu(&this_cpu, &this_node);

	if (rc == 0)
		return (int32_t) this_node;

	perror("sys_getcpu()");
	return rc;
}
306