xref: /linux/kernel/bpf/syscall.c (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
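
/* Illustrative sketch (not part of this file): a map implementation
 * registers itself from an initcall, roughly as kernel/bpf/hashtab.c
 * does for BPF_MAP_TYPE_HASH:
 *
 *	static struct bpf_map_type_list htab_type __read_mostly = {
 *		.ops = &htab_ops,
 *		.type = BPF_MAP_TYPE_HASH,
 *	};
 *
 *	static int __init register_htab_map(void)
 *	{
 *		bpf_register_map_type(&htab_type);
 *		return 0;
 *	}
 *	late_initcall(register_htab_map);
 */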

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
		/* prog_array stores refcnt-ed bpf_prog pointers;
		 * release them all when user space closes prog_array_fd
		 */
		bpf_fd_array_map_clear(map);

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
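
/* Illustrative expansion: with BPF_MAP_CREATE_LAST_FIELD defined as
 * max_entries below, CHECK_ATTR(BPF_MAP_CREATE) scans every byte of the
 * union that lies past attr->max_entries and evaluates to true (turned
 * into -EINVAL by the caller) if any of them is non-zero, roughly:
 *
 *	memchr_inv((void *) &attr->max_entries +
 *		   sizeof(attr->max_entries), 0,
 *		   sizeof(*attr) -
 *		   offsetof(union bpf_attr, max_entries) -
 *		   sizeof(attr->max_entries)) != NULL
 */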

#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);

	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}
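
/* Example user-space invocation (sketch; assumes the uapi <linux/bpf.h>
 * definitions and a raw syscall(2) wrapper):
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = sizeof(int),
 *		.value_size  = sizeof(long),
 *		.max_entries = 256,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * on success map_fd refers to the anon inode created above; the fd is
 * O_CLOEXEC and supports no file operations other than release.
 */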

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}

/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
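
/* Example user-space lookup (sketch): key and value pointers travel as
 * __aligned_u64 values, hence the double casts; map_fd is assumed to
 * come from a prior BPF_MAP_CREATE:
 *
 *	int key = 1;
 *	long value;
 *	union bpf_attr attr = {
 *		.map_fd = map_fd,
 *		.key    = (__u64)(unsigned long)&key,
 *		.value  = (__u64)(unsigned long)&value,
 *	};
 *	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
 *		printf("value = %ld\n", value);
 */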

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps run under rcu_read_lock(),
	 * and all map accessors rely on that, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
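
/* Example: walking all keys from user space (sketch). For hash maps a
 * key that is not present yields the first key in the table, so
 * starting from an out-of-band key walks the whole map, roughly:
 *
 *	int key = -1, next_key;
 *	union bpf_attr attr = {
 *		.map_fd   = map_fd,
 *		.key      = (__u64)(unsigned long)&key,
 *		.next_key = (__u64)(unsigned long)&next_key,
 *	};
 *	while (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr,
 *		       sizeof(attr)) == 0)
 *		key = next_key;
 *
 * the loop terminates when the syscall fails with errno == ENOENT.
 */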

static LIST_HEAD(bpf_prog_types);

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->type = type;
			return 0;
		}
	}

	return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}
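
/* Illustrative sketch: program types register the same way as map
 * types; net/core/filter.c does roughly this for socket filters:
 *
 *	static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 *		.ops = &sk_filter_ops,
 *		.type = BPF_PROG_TYPE_SOCKET_FILTER,
 *	};
 *
 *	static int __init register_sk_filter_ops(void)
 *	{
 *		bpf_register_prog_type(&sk_filter_type);
 *		return 0;
 *	}
 *	late_initcall(register_sk_filter_ops);
 */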

/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program has passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when the program has bpf_call
			 * instructions and has passed bpf_check(), which means
			 * ops->get_func_proto must have been supplied; check it
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			if (insn->imm == BPF_FUNC_tail_call) {
				/* mark bpf_tail_call as a different opcode
				 * to avoid a conditional branch in the
				 * interpreter for every normal call
				 * and to prevent accidental JITing by a
				 * JIT compiler that doesn't support
				 * bpf_tail_call yet
				 */
				insn->imm = 0;
				insn->code |= BPF_X;
				continue;
			}

			fn = prog->aux->ops->get_func_proto(insn->imm);
			/* all functions that have a prototype and that the
			 * verifier allowed programs to call must be real
			 * in-kernel functions
			 */
			BUG_ON(!fn->func);
			insn->imm = fn->func - __bpf_call_base;
		}
	}
}

/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}

static void __prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_free(aux->prog);
}

/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		prog->aux->prog = prog;
		call_rcu(&prog->aux->rcu, __prog_put_rcu);
	}
}

void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put_rcu(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};

static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}

/* called by sockets/tracing/seccomp before attaching program to an event
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);

	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);
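
/* Typical caller pattern (sketch, modelled on the SO_ATTACH_BPF path in
 * net/core/sock.c): hold the reference for the lifetime of the
 * attachment and drop it with bpf_prog_put() on detach or on error;
 * attach_prog_to_event() here is a hypothetical placeholder:
 *
 *	prog = bpf_prog_get(ufd);
 *	if (IS_ERR(prog))
 *		return PTR_ERR(prog);
 *
 *	err = attach_prog_to_event(prog);
 *	if (err)
 *		bpf_prog_put(prog);
 */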

/* last field in 'union bpf_attr' used by this command */
#define	BPF_PROG_LOAD_LAST_FIELD kern_version

static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = false;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	err = bpf_prog_select_runtime(prog);
	if (err < 0)
		goto free_used_maps;

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}
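
/* Example user-space load (sketch): a minimal "return 0" socket filter.
 * The BPF_MOV64_IMM()/BPF_EXIT_INSN() instruction-building macros are
 * assumed from a helper header such as the one carried in samples/bpf;
 * they are not part of the uapi:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_MOV64_IMM(BPF_REG_0, 0),
 *		BPF_EXIT_INSN(),
 *	};
 *	union bpf_attr attr = {
 *		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
 *		.insn_cnt  = 2,
 *		.insns     = (__u64)(unsigned long)insns,
 *		.license   = (__u64)(unsigned long)"GPL",
 *	};
 *	int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 */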

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
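
/* User-space view (sketch): glibc provides no wrapper for this syscall
 * in this era, so callers typically go through syscall(2) directly:
 *
 *	static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 *	{
 *		return syscall(__NR_bpf, cmd, attr, size);
 *	}
 *
 * passing size = sizeof(*attr) as user space sees it lets a binary
 * built against newer headers run on an older kernel, as long as the
 * fields the old kernel does not know about are left zero; that is
 * exactly what the trailing-byte check above enforces.
 */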