// SPDX-License-Identifier: GPL-2.0
/*
 * BPF filter support for io_uring. Supports SQE opcodes for now.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/io_uring.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "bpf_filter.h"
#include "net.h"
#include "openclose.h"

struct io_bpf_filter {
	refcount_t refs;
	struct bpf_prog *prog;
	struct io_bpf_filter *next;
};

/* Deny if this is set as the filter */
static const struct io_bpf_filter dummy_filter;

static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
				      struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];

	bctx->opcode = req->opcode;
	bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
	bctx->user_data = req->cqe.user_data;
	/* clear residual, anything from pdu_size and below */
	memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
	       sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));

	/*
	 * Opcodes can provide a handler for populating more data into bctx,
	 * for filters to use.
	 */
	if (def->filter_pdu_size) {
		bctx->pdu_size = def->filter_pdu_size;
		def->filter_populate(bctx, req);
	}
}
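
/*
 * Illustrative sketch of the opcode side of this hook. The two fields are
 * the ones tested above in io_issue_def; the opcode, pdu struct, and
 * handler below are hypothetical, not an existing in-tree user:
 *
 *	static void io_foo_filter_populate(struct io_uring_bpf_ctx *bctx,
 *					   struct io_kiocb *req)
 *	{
 *		// fill opcode-specific data following pdu_size in bctx
 *	}
 *
 *	[IORING_OP_FOO] = {
 *		.filter_pdu_size	= sizeof(struct io_foo_bpf_pdu),
 *		.filter_populate	= io_foo_filter_populate,
 *	},
 */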

/*
 * Run registered filters for a given opcode. A filter returning 0 denies
 * execution of the request, a return of 1 allows it. If any filter for an
 * opcode returns 0, filter processing stops and the request is denied.
 *
 * __io_uring_run_bpf_filters() returns 0 on success, allowing the request
 * to run, and -EACCES when the request is denied.
 */
int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
			       struct io_kiocb *req)
{
	struct io_bpf_filter *filter;
	struct io_uring_bpf_ctx bpf_ctx;
	int ret;

	/* Fast check for existence of filters outside of RCU */
	if (!rcu_access_pointer(filters[req->opcode]))
		return 0;

	/*
	 * req->opcode has already been validated to be within the expected
	 * range; io_init_req() does this.
	 */
	guard(rcu)();
	filter = rcu_dereference(filters[req->opcode]);
	if (!filter)
		return 0;
	else if (filter == &dummy_filter)
		return -EACCES;

	io_uring_populate_bpf_ctx(&bpf_ctx, req);

	/*
	 * Iterate registered filters. The opcode is allowed iff all filters
	 * return 1; if any filter returns 0, the request is denied.
	 */
	do {
		if (filter == &dummy_filter)
			return -EACCES;
		ret = bpf_prog_run(filter->prog, &bpf_ctx);
		if (!ret)
			return -EACCES;
		filter = filter->next;
	} while (filter);

	return 0;
}
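
/*
 * Minimal filter honoring the 0/1 contract above (a sketch, assuming
 * opcode is a 32-bit field at a 4-byte aligned offset in the uapi
 * struct io_uring_bpf_ctx): allow only IORING_OP_NOP, deny everything
 * else.
 *
 *	struct sock_filter prog[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct io_uring_bpf_ctx, opcode)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IORING_OP_NOP, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *
 * The JEQ falls through to 'return 1' (allow) on a match and skips to
 * 'return 0' (deny) otherwise.
 */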

static void io_free_bpf_filters(struct rcu_head *head)
{
	struct io_bpf_filter __rcu **filter;
	struct io_bpf_filters *filters;
	int i;

	filters = container_of(head, struct io_bpf_filters, rcu_head);
	scoped_guard(spinlock, &filters->lock) {
		filter = filters->filters;
		if (!filter)
			return;
	}

	for (i = 0; i < IORING_OP_LAST; i++) {
		struct io_bpf_filter *f;

		rcu_read_lock();
		f = rcu_dereference(filter[i]);
		while (f) {
			struct io_bpf_filter *next = f->next;

			/*
			 * Even if stacked, dummy filter will always be last
			 * as it can only get installed into an empty spot.
			 */
			if (f == &dummy_filter)
				break;

			/* Someone still holds a ref, stop iterating. */
			if (!refcount_dec_and_test(&f->refs))
				break;

			bpf_prog_destroy(f->prog);
			kfree(f);
			f = next;
		}
		rcu_read_unlock();
	}
	kfree(filters->filters);
	kfree(filters);
}

static void __io_put_bpf_filters(struct io_bpf_filters *filters)
{
	if (refcount_dec_and_test(&filters->refs))
		call_rcu(&filters->rcu_head, io_free_bpf_filters);
}

void io_put_bpf_filters(struct io_restriction *res)
{
	if (res->bpf_filters)
		__io_put_bpf_filters(res->bpf_filters);
}

static struct io_bpf_filters *io_new_bpf_filters(void)
{
	struct io_bpf_filters *filters __free(kfree) = NULL;

	filters = kzalloc_obj(*filters, GFP_KERNEL_ACCOUNT);
	if (!filters)
		return ERR_PTR(-ENOMEM);

	filters->filters = kzalloc_objs(struct io_bpf_filter *, IORING_OP_LAST,
					GFP_KERNEL_ACCOUNT);
	if (!filters->filters)
		return ERR_PTR(-ENOMEM);

	refcount_set(&filters->refs, 1);
	spin_lock_init(&filters->lock);
	return no_free_ptr(filters);
}

/*
 * Validate classic BPF filter instructions. Only allow a safe subset of
 * operations - no packet data access, just context field loads and basic
 * ALU/jump operations.
 */
static int io_uring_check_cbpf_filter(struct sock_filter *filter,
				      unsigned int flen)
{
	int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct io_uring_bpf_ctx);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct io_uring_bpf_ctx);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
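
/*
 * For example (a sketch, assuming sqe_flags is a 32-bit field in the uapi
 * struct io_uring_bpf_ctx), a context load written by userspace as
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *		 offsetof(struct io_uring_bpf_ctx, sqe_flags))
 *
 * is in bounds and 32-bit aligned, so it passes the check and is rewritten
 * into an LDX context load, seccomp-style. A byte- or halfword-sized load
 * (e.g. BPF_LD | BPF_B | BPF_ABS), or a misaligned offset, fails with
 * -EINVAL.
 */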

void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
{
	if (!src->bpf_filters)
		return;

	rcu_read_lock();
	/*
	 * If the src filter is going away, just ignore it.
	 */
	if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
		dst->bpf_filters = src->bpf_filters;
		dst->bpf_filters_cow = true;
	}
	rcu_read_unlock();
}

/*
 * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
 * modifications need to be made.
 */
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
{
	struct io_bpf_filters *filters;
	struct io_bpf_filter *srcf;
	int i;

	filters = io_new_bpf_filters();
	if (IS_ERR(filters))
		return filters;

	/*
	 * Iterate filters from src and assign in destination. Grabbing
	 * a reference is enough, we don't need to duplicate the memory.
	 * This is safe because filters are only ever appended to the
	 * front of the list, hence the only memory ever touched inside
	 * a filter is the refcount.
	 */
	rcu_read_lock();
	for (i = 0; i < IORING_OP_LAST; i++) {
		srcf = rcu_dereference(src->bpf_filters->filters[i]);
		if (!srcf) {
			continue;
		} else if (srcf == &dummy_filter) {
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
			continue;
		}

		/*
		 * Getting a ref on the first node is enough, putting the
		 * filter and iterating nodes to free will stop on the first
		 * one that doesn't hit zero when dropping.
		 */
		if (!refcount_inc_not_zero(&srcf->refs))
			goto err;
		rcu_assign_pointer(filters->filters[i], srcf);
	}
	rcu_read_unlock();
	return filters;
err:
	rcu_read_unlock();
	__io_put_bpf_filters(filters);
	return ERR_PTR(-EBUSY);
}
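
/*
 * Resulting shape after a COW copy (illustrative): source and copy share
 * the same per-opcode list nodes, only the head node's refcount is
 * bumped. With F2 registered after F1 for some opcode:
 *
 *	src->filters[op] --\
 *	                    +--> F2 (refs == 2) --> F1 (refs == 1)
 *	new->filters[op] --/
 */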

#define IO_URING_BPF_FILTER_FLAGS	(IO_URING_BPF_FILTER_DENY_REST | \
					 IO_URING_BPF_FILTER_SZ_STRICT)

static int io_bpf_filter_import(struct io_uring_bpf *reg,
				struct io_uring_bpf __user *arg)
{
	const struct io_issue_def *def;
	int ret;

	if (copy_from_user(reg, arg, sizeof(*reg)))
		return -EFAULT;
	if (reg->cmd_type != IO_URING_BPF_CMD_FILTER)
		return -EINVAL;
	if (reg->cmd_flags || reg->resv)
		return -EINVAL;

	if (reg->filter.opcode >= IORING_OP_LAST)
		return -EINVAL;
	if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
		return -EINVAL;
	if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv)))
		return -EINVAL;
	if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2)))
		return -EINVAL;
	if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS)
		return -EINVAL;

	/* Verify filter size */
	def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)];

	/* same size, always ok */
	ret = 0;
	if (reg->filter.pdu_size == def->filter_pdu_size)
		;
	/* size differs, fail in strict mode */
	else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT)
		ret = -EMSGSIZE;
	/* userspace filter is bigger, always disallow */
	else if (reg->filter.pdu_size > def->filter_pdu_size)
		ret = -EMSGSIZE;

	/* copy back kernel filter size */
	reg->filter.pdu_size = def->filter_pdu_size;
	if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter)))
		return -EFAULT;

	return ret;
}

int io_register_bpf_filter(struct io_restriction *res,
			   struct io_uring_bpf __user *arg)
{
	struct io_bpf_filters *filters, *old_filters = NULL;
	struct io_bpf_filter *filter, *old_filter;
	struct io_uring_bpf reg;
	struct bpf_prog *prog;
	struct sock_fprog fprog;
	int ret;

	ret = io_bpf_filter_import(&reg, arg);
	if (ret)
		return ret;

	fprog.len = reg.filter.filter_len;
	fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);

	ret = bpf_prog_create_from_user(&prog, &fprog,
					io_uring_check_cbpf_filter, false);
	if (ret)
		return ret;

	/*
	 * No existing filters, allocate set.
	 */
	filters = res->bpf_filters;
	if (!filters) {
		filters = io_new_bpf_filters();
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
	} else if (res->bpf_filters_cow) {
		filters = io_bpf_filter_cow(res);
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
		/*
		 * Stash old filters, we'll put them once we know we'll
		 * succeed. Until then, res->bpf_filters is left untouched.
		 */
		old_filters = res->bpf_filters;
	}

	filter = kzalloc_obj(*filter, GFP_KERNEL_ACCOUNT);
	if (!filter) {
		ret = -ENOMEM;
		goto err;
	}
	refcount_set(&filter->refs, 1);
	filter->prog = prog;

	/*
	 * Success - install the new filter set now. If we did COW, put
	 * the old filters as we're replacing them.
	 */
	if (old_filters) {
		__io_put_bpf_filters(old_filters);
		res->bpf_filters_cow = false;
	}
	res->bpf_filters = filters;

	/*
	 * Insert filter - if the current opcode already has a filter
	 * attached, add to the set.
	 */
	rcu_read_lock();
	spin_lock_bh(&filters->lock);
	old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
	if (old_filter)
		filter->next = old_filter;
	rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);

	/*
	 * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered
	 * opcode with the dummy filter. That will cause them to be denied.
	 */
	if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
		for (int i = 0; i < IORING_OP_LAST; i++) {
			if (i == reg.filter.opcode)
				continue;
			old_filter = rcu_dereference(filters->filters[i]);
			if (old_filter)
				continue;
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
		}
	}

	spin_unlock_bh(&filters->lock);
	rcu_read_unlock();
	return 0;
err:
	if (filters != res->bpf_filters)
		__io_put_bpf_filters(filters);
err_prog:
	bpf_prog_destroy(prog);
	return ret;
}
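
/*
 * Userspace registration sketch, using 'prog' from the example filter
 * above. Illustrative only: the register opcode IORING_REGISTER_BPF and
 * the nr_args value are assumptions, not taken from this file:
 *
 *	struct io_uring_bpf reg = {
 *		.cmd_type		= IO_URING_BPF_CMD_FILTER,
 *		.filter.opcode		= IORING_OP_NOP,
 *		.filter.flags		= IO_URING_BPF_FILTER_DENY_REST,
 *		.filter.filter_len	= ARRAY_SIZE(prog),
 *		.filter.filter_ptr	= (unsigned long) prog,
 *	};
 *	int ret = io_uring_register(ring_fd, IORING_REGISTER_BPF, &reg, 1);
 *
 * On return, reg.filter.pdu_size holds the kernel's pdu size for the
 * opcode, as copied back by io_bpf_filter_import().
 */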