xref: /linux/io_uring/bpf_filter.c (revision 8934827db5403eae57d4537114a9ff88b0a8460f)
// SPDX-License-Identifier: GPL-2.0
/*
 * BPF filter support for io_uring. Supports SQE opcodes for now.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/io_uring.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "bpf_filter.h"
#include "net.h"
#include "openclose.h"

struct io_bpf_filter {
	refcount_t		refs;
	struct bpf_prog		*prog;
	struct io_bpf_filter	*next;
};

/* Deny if this is set as the filter */
static const struct io_bpf_filter dummy_filter;
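/*
 * Filters live in a per-opcode array of RCU protected, singly linked
 * lists, with new filters stacked at the head. A sketch of the layout
 * (the opcodes below are illustrative only):
 *
 *	filters[IORING_OP_READV]  -> newest -> older -> NULL
 *	filters[IORING_OP_WRITEV] -> &dummy_filter	(deny everything)
 *	filters[IORING_OP_NOP]    -> NULL		(no filter, allow)
 *
 * As &dummy_filter can only be installed into an empty slot, it is
 * always the last entry in a chain.
 */
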
static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
				      struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];

	bctx->opcode = req->opcode;
	bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
	bctx->user_data = req->cqe.user_data;
	/* clear residual data, from pdu_size to the end of the struct */
	memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
		sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));

	/*
	 * Opcodes can provide a handler for populating more data into bctx,
	 * for filters to use.
	 */
	if (def->filter_pdu_size) {
		bctx->pdu_size = def->filter_pdu_size;
		def->filter_populate(bctx, req);
	}
}

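/*
 * A minimal sketch of an opcode-private populate handler, assuming the
 * opcode wires up ->filter_pdu_size and ->filter_populate in its
 * io_issue_def entry. The io_foo_* names are hypothetical:
 *
 *	static void io_foo_filter_populate(struct io_uring_bpf_ctx *bctx,
 *					   struct io_kiocb *req)
 *	{
 *		// fill opcode specific data for filters to inspect,
 *		// bounded by the advertised filter_pdu_size
 *	}
 *
 * paired with an io_issue_def entry along the lines of:
 *
 *	.filter_pdu_size	= sizeof(struct io_foo_bpf_pdu),
 *	.filter_populate	= io_foo_filter_populate,
 */
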
/*
 * Run registered filters for a given opcode. A filter returning 0 denies
 * execution of the request, a return of 1 allows it. If any filter for an
 * opcode returns 0, filter processing stops at that point and the request
 * is denied.
 *
 * __io_uring_run_bpf_filters() returns 0 on success, allowing the request
 * to run, and -EACCES if the request is denied.
 */
int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
			       struct io_kiocb *req)
{
	struct io_bpf_filter *filter;
	struct io_uring_bpf_ctx bpf_ctx;
	int ret;

	/* Fast check for existence of filters outside of RCU */
	if (!rcu_access_pointer(filters[req->opcode]))
		return 0;

	/*
	 * req->opcode has already been validated to be within the range
	 * of what we expect, io_init_req() does this.
	 */
	guard(rcu)();
	filter = rcu_dereference(filters[req->opcode]);
	if (!filter)
		return 0;
	else if (filter == &dummy_filter)
		return -EACCES;

	io_uring_populate_bpf_ctx(&bpf_ctx, req);

	/*
	 * Iterate registered filters. The opcode is allowed iff all filters
	 * return 1; if any filter returns 0, the opcode is denied.
	 */
	do {
		if (filter == &dummy_filter)
			return -EACCES;
		ret = bpf_prog_run(filter->prog, &bpf_ctx);
		if (!ret)
			return -EACCES;
		filter = filter->next;
	} while (filter);

	return 0;
}

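/*
 * A minimal sketch of a filter as userspace could express it with the
 * classic BPF macros from <linux/filter.h>: load the opcode field from
 * struct io_uring_bpf_ctx and allow only IORING_OP_NOP (an arbitrary
 * choice for illustration), denying everything else. This assumes the
 * opcode field sits at a 32-bit aligned offset, as the checker below
 * requires:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct io_uring_bpf_ctx, opcode)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IORING_OP_NOP, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 1),	// allow the request
 *		BPF_STMT(BPF_RET | BPF_K, 0),	// deny, run fails -EACCES
 *	};
 */
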
static void io_free_bpf_filters(struct rcu_head *head)
{
	struct io_bpf_filter __rcu **filter;
	struct io_bpf_filters *filters;
	int i;

	filters = container_of(head, struct io_bpf_filters, rcu_head);
	scoped_guard(spinlock, &filters->lock) {
		filter = filters->filters;
		if (!filter)
			return;
	}

	for (i = 0; i < IORING_OP_LAST; i++) {
		struct io_bpf_filter *f;

		rcu_read_lock();
		f = rcu_dereference(filter[i]);
		while (f) {
			struct io_bpf_filter *next = f->next;

			/*
			 * Even if stacked, dummy filter will always be last
			 * as it can only get installed into an empty spot.
			 */
			if (f == &dummy_filter)
				break;

			/* Someone still holds a ref, stop iterating. */
			if (!refcount_dec_and_test(&f->refs))
				break;

			bpf_prog_destroy(f->prog);
			kfree(f);
			f = next;
		}
		rcu_read_unlock();
	}
	kfree(filters->filters);
	kfree(filters);
}

static void __io_put_bpf_filters(struct io_bpf_filters *filters)
{
	if (refcount_dec_and_test(&filters->refs))
		call_rcu(&filters->rcu_head, io_free_bpf_filters);
}

void io_put_bpf_filters(struct io_restriction *res)
{
	if (res->bpf_filters)
		__io_put_bpf_filters(res->bpf_filters);
}

static struct io_bpf_filters *io_new_bpf_filters(void)
{
	struct io_bpf_filters *filters __free(kfree) = NULL;

	filters = kzalloc_obj(*filters, GFP_KERNEL_ACCOUNT);
	if (!filters)
		return ERR_PTR(-ENOMEM);

	filters->filters = kzalloc_objs(struct io_bpf_filter *, IORING_OP_LAST,
					GFP_KERNEL_ACCOUNT);
	if (!filters->filters)
		return ERR_PTR(-ENOMEM);

	refcount_set(&filters->refs, 1);
	spin_lock_init(&filters->lock);
	return no_free_ptr(filters);
}

/*
 * Validate classic BPF filter instructions. Only allow a safe subset of
 * operations - no packet data access, just context field loads and basic
 * ALU/jump operations.
 */
static int io_uring_check_cbpf_filter(struct sock_filter *filter,
				      unsigned int flen)
{
	int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct io_uring_bpf_ctx);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct io_uring_bpf_ctx);
			continue;
		/* Explicitly include allowed instructions. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}

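/*
 * The BPF_LEN rewrites above mean a filter need not hardcode the context
 * size: a classic length load such as
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_LEN, 0),
 *
 * becomes a load of the constant sizeof(struct io_uring_bpf_ctx) on the
 * running kernel, rather than a packet length load.
 */
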
void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
{
	if (!src->bpf_filters)
		return;

	rcu_read_lock();
	/*
	 * If the src filter is going away, just ignore it.
	 */
	if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
		dst->bpf_filters = src->bpf_filters;
		dst->bpf_filters_cow = true;
	}
	rcu_read_unlock();
}

/*
 * Allocate a new struct io_bpf_filters. Used when a filter set is cloned
 * and modifications need to be made.
 */
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
{
	struct io_bpf_filters *filters;
	struct io_bpf_filter *srcf;
	int i;

	filters = io_new_bpf_filters();
	if (IS_ERR(filters))
		return filters;

	/*
	 * Iterate filters from src and assign in the destination. Grabbing
	 * a reference is enough, we don't need to duplicate the memory.
	 * This is safe because filters are only ever added to the front of
	 * the list, hence the only memory ever touched inside a filter is
	 * the refcount.
	 */
	rcu_read_lock();
	for (i = 0; i < IORING_OP_LAST; i++) {
		srcf = rcu_dereference(src->bpf_filters->filters[i]);
		if (!srcf) {
			continue;
		} else if (srcf == &dummy_filter) {
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
			continue;
		}

		/*
		 * Getting a ref on the first node is enough: when the
		 * filters are put, freeing stops at the first node whose
		 * refcount doesn't drop to zero.
		 */
		if (!refcount_inc_not_zero(&srcf->refs))
			goto err;
		rcu_assign_pointer(filters->filters[i], srcf);
	}
	rcu_read_unlock();
	return filters;
err:
	rcu_read_unlock();
	__io_put_bpf_filters(filters);
	return ERR_PTR(-EBUSY);
}

#define IO_URING_BPF_FILTER_FLAGS	(IO_URING_BPF_FILTER_DENY_REST | \
					 IO_URING_BPF_FILTER_SZ_STRICT)

static int io_bpf_filter_import(struct io_uring_bpf *reg,
				struct io_uring_bpf __user *arg)
{
	const struct io_issue_def *def;
	int ret;

	if (copy_from_user(reg, arg, sizeof(*reg)))
		return -EFAULT;
	if (reg->cmd_type != IO_URING_BPF_CMD_FILTER)
		return -EINVAL;
	if (reg->cmd_flags || reg->resv)
		return -EINVAL;

	if (reg->filter.opcode >= IORING_OP_LAST)
		return -EINVAL;
	if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
		return -EINVAL;
	if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv)))
		return -EINVAL;
	if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2)))
		return -EINVAL;
	if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS)
		return -EINVAL;

	/* Verify filter pdu size against the kernel's pdu size */
	def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)];

	/* same size, always ok */
	ret = 0;
	if (reg->filter.pdu_size == def->filter_pdu_size)
		;
	/* size differs, fail in strict mode */
	else if (reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT)
		ret = -EMSGSIZE;
	/* userspace filter is bigger, always disallow */
	else if (reg->filter.pdu_size > def->filter_pdu_size)
		ret = -EMSGSIZE;

	/* copy back the kernel's pdu size */
	reg->filter.pdu_size = def->filter_pdu_size;
	if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter)))
		return -EFAULT;

	return ret;
}

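/*
 * Worked example of the pdu_size handshake above, assuming an opcode
 * whose kernel side fills 16 bytes of pdu data: a filter declaring
 * pdu_size 16 always matches; one declaring 8 is accepted unless
 * IO_URING_BPF_FILTER_SZ_STRICT is set, as the kernel provides at least
 * what the filter expects; one declaring 24 is always rejected with
 * -EMSGSIZE. In every case the kernel's pdu_size is copied back, so
 * userspace can detect the mismatch.
 */
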
int io_register_bpf_filter(struct io_restriction *res,
			   struct io_uring_bpf __user *arg)
{
	struct io_bpf_filters *filters, *old_filters = NULL;
	struct io_bpf_filter *filter, *old_filter;
	struct io_uring_bpf reg;
	struct bpf_prog *prog;
	struct sock_fprog fprog;
	int ret;

	ret = io_bpf_filter_import(&reg, arg);
	if (ret)
		return ret;

	fprog.len = reg.filter.filter_len;
	fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);

	ret = bpf_prog_create_from_user(&prog, &fprog,
					io_uring_check_cbpf_filter, false);
	if (ret)
		return ret;

	/*
	 * No existing filters, allocate a set.
	 */
	filters = res->bpf_filters;
	if (!filters) {
		filters = io_new_bpf_filters();
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
	} else if (res->bpf_filters_cow) {
		filters = io_bpf_filter_cow(res);
		if (IS_ERR(filters)) {
			ret = PTR_ERR(filters);
			goto err_prog;
		}
		/*
		 * Stash the old filters, we'll put them once we know we'll
		 * succeed. Until then, res->bpf_filters is left untouched.
		 */
		old_filters = res->bpf_filters;
	}

	filter = kzalloc_obj(*filter, GFP_KERNEL_ACCOUNT);
	if (!filter) {
		ret = -ENOMEM;
		goto err;
	}
	refcount_set(&filter->refs, 1);
	filter->prog = prog;

	/*
	 * Success - install the new filter set now. If we did COW, put
	 * the old filters as we're replacing them.
	 */
	if (old_filters) {
		__io_put_bpf_filters(old_filters);
		res->bpf_filters_cow = false;
	}
	res->bpf_filters = filters;

	/*
	 * Insert filter - if the current opcode already has a filter
	 * attached, stack the new one in front of it.
	 */
	rcu_read_lock();
	spin_lock_bh(&filters->lock);
	old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
	if (old_filter)
		filter->next = old_filter;
	rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);

	/*
	 * If IO_URING_BPF_FILTER_DENY_REST is set, fill any opcodes without
	 * a registered filter with the dummy filter, causing them to be
	 * denied.
	 */
	if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
		for (int i = 0; i < IORING_OP_LAST; i++) {
			if (i == reg.filter.opcode)
				continue;
			old_filter = rcu_dereference(filters->filters[i]);
			if (old_filter)
				continue;
			rcu_assign_pointer(filters->filters[i], &dummy_filter);
		}
	}

	spin_unlock_bh(&filters->lock);
	rcu_read_unlock();
	return 0;
err:
	if (filters != res->bpf_filters)
		__io_put_bpf_filters(filters);
err_prog:
	bpf_prog_destroy(prog);
	return ret;
}
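
/*
 * A rough userspace sketch of registering a filter, reusing the insns[]
 * program from the sketch above __io_uring_run_bpf_filters(). The
 * io_uring_register(2) opcode that routes to io_register_bpf_filter()
 * is not part of this file, so it is shown as a placeholder, and the
 * argument count is assumed:
 *
 *	struct io_uring_bpf reg = {
 *		.cmd_type		= IO_URING_BPF_CMD_FILTER,
 *		.filter.opcode		= IORING_OP_NOP,
 *		.filter.flags		= IO_URING_BPF_FILTER_DENY_REST,
 *		.filter.filter_len	= ARRAY_SIZE(insns),
 *		.filter.filter_ptr	= (__u64) (uintptr_t) insns,
 *	};
 *
 *	io_uring_register(ring_fd, <filter register opcode>, &reg, 1);
 *
 * With IO_URING_BPF_FILTER_DENY_REST set, every opcode without a filter
 * is denied outright via the dummy filter.
 */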