1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2 * 3 * This program is free software; you can redistribute it and/or 4 * modify it under the terms of version 2 of the GNU General Public 5 * License as published by the Free Software Foundation. 6 * 7 * This program is distributed in the hope that it will be useful, but 8 * WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * General Public License for more details. 11 */ 12 #include <linux/bpf.h> 13 #include <linux/syscalls.h> 14 #include <linux/slab.h> 15 #include <linux/anon_inodes.h> 16 #include <linux/file.h> 17 #include <linux/license.h> 18 #include <linux/filter.h> 19 #include <linux/version.h> 20 21 static LIST_HEAD(bpf_map_types); 22 23 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 24 { 25 struct bpf_map_type_list *tl; 26 struct bpf_map *map; 27 28 list_for_each_entry(tl, &bpf_map_types, list_node) { 29 if (tl->type == attr->map_type) { 30 map = tl->ops->map_alloc(attr); 31 if (IS_ERR(map)) 32 return map; 33 map->ops = tl->ops; 34 map->map_type = attr->map_type; 35 return map; 36 } 37 } 38 return ERR_PTR(-EINVAL); 39 } 40 41 /* boot time registration of different map implementations */ 42 void bpf_register_map_type(struct bpf_map_type_list *tl) 43 { 44 list_add(&tl->list_node, &bpf_map_types); 45 } 46 47 /* called from workqueue */ 48 static void bpf_map_free_deferred(struct work_struct *work) 49 { 50 struct bpf_map *map = container_of(work, struct bpf_map, work); 51 52 /* implementation dependent freeing */ 53 map->ops->map_free(map); 54 } 55 56 /* decrement map refcnt and schedule it for freeing via workqueue 57 * (unrelying map implementation ops->map_free() might sleep) 58 */ 59 void bpf_map_put(struct bpf_map *map) 60 { 61 if (atomic_dec_and_test(&map->refcnt)) { 62 INIT_WORK(&map->work, bpf_map_free_deferred); 63 schedule_work(&map->work); 64 } 65 } 66 67 static int bpf_map_release(struct inode *inode, struct file *filp) 68 { 69 struct bpf_map *map = filp->private_data; 70 71 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) 72 /* prog_array stores refcnt-ed bpf_prog pointers 73 * release them all when user space closes prog_array_fd 74 */ 75 bpf_fd_array_map_clear(map); 76 77 bpf_map_put(map); 78 return 0; 79 } 80 81 static const struct file_operations bpf_map_fops = { 82 .release = bpf_map_release, 83 }; 84 85 /* helper macro to check that unused fields 'union bpf_attr' are zero */ 86 #define CHECK_ATTR(CMD) \ 87 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 88 sizeof(attr->CMD##_LAST_FIELD), 0, \ 89 sizeof(*attr) - \ 90 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ 91 sizeof(attr->CMD##_LAST_FIELD)) != NULL 92 93 #define BPF_MAP_CREATE_LAST_FIELD max_entries 94 /* called via syscall */ 95 static int map_create(union bpf_attr *attr) 96 { 97 struct bpf_map *map; 98 int err; 99 100 err = CHECK_ATTR(BPF_MAP_CREATE); 101 if (err) 102 return -EINVAL; 103 104 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ 105 map = find_and_alloc_map(attr); 106 if (IS_ERR(map)) 107 return PTR_ERR(map); 108 109 atomic_set(&map->refcnt, 1); 110 111 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); 112 113 if (err < 0) 114 /* failed to allocate fd */ 115 goto free_map; 116 117 return err; 118 119 free_map: 120 map->ops->map_free(map); 121 return err; 122 } 123 124 /* if error is returned, fd is released. 125 * On success caller should complete fd access with matching fdput() 126 */ 127 struct bpf_map *bpf_map_get(struct fd f) 128 { 129 struct bpf_map *map; 130 131 if (!f.file) 132 return ERR_PTR(-EBADF); 133 134 if (f.file->f_op != &bpf_map_fops) { 135 fdput(f); 136 return ERR_PTR(-EINVAL); 137 } 138 139 map = f.file->private_data; 140 141 return map; 142 } 143 144 /* helper to convert user pointers passed inside __aligned_u64 fields */ 145 static void __user *u64_to_ptr(__u64 val) 146 { 147 return (void __user *) (unsigned long) val; 148 } 149 150 /* last field in 'union bpf_attr' used by this command */ 151 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value 152 153 static int map_lookup_elem(union bpf_attr *attr) 154 { 155 void __user *ukey = u64_to_ptr(attr->key); 156 void __user *uvalue = u64_to_ptr(attr->value); 157 int ufd = attr->map_fd; 158 struct bpf_map *map; 159 void *key, *value, *ptr; 160 struct fd f; 161 int err; 162 163 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 164 return -EINVAL; 165 166 f = fdget(ufd); 167 map = bpf_map_get(f); 168 if (IS_ERR(map)) 169 return PTR_ERR(map); 170 171 err = -ENOMEM; 172 key = kmalloc(map->key_size, GFP_USER); 173 if (!key) 174 goto err_put; 175 176 err = -EFAULT; 177 if (copy_from_user(key, ukey, map->key_size) != 0) 178 goto free_key; 179 180 err = -ENOMEM; 181 value = kmalloc(map->value_size, GFP_USER); 182 if (!value) 183 goto free_key; 184 185 rcu_read_lock(); 186 ptr = map->ops->map_lookup_elem(map, key); 187 if (ptr) 188 memcpy(value, ptr, map->value_size); 189 rcu_read_unlock(); 190 191 err = -ENOENT; 192 if (!ptr) 193 goto free_value; 194 195 err = -EFAULT; 196 if (copy_to_user(uvalue, value, map->value_size) != 0) 197 goto free_value; 198 199 err = 0; 200 201 free_value: 202 kfree(value); 203 free_key: 204 kfree(key); 205 err_put: 206 fdput(f); 207 return err; 208 } 209 210 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags 211 212 static int map_update_elem(union bpf_attr *attr) 213 { 214 void __user *ukey = u64_to_ptr(attr->key); 215 void __user *uvalue = u64_to_ptr(attr->value); 216 int ufd = attr->map_fd; 217 struct bpf_map *map; 218 void *key, *value; 219 struct fd f; 220 int err; 221 222 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 223 return -EINVAL; 224 225 f = fdget(ufd); 226 map = bpf_map_get(f); 227 if (IS_ERR(map)) 228 return PTR_ERR(map); 229 230 err = -ENOMEM; 231 key = kmalloc(map->key_size, GFP_USER); 232 if (!key) 233 goto err_put; 234 235 err = -EFAULT; 236 if (copy_from_user(key, ukey, map->key_size) != 0) 237 goto free_key; 238 239 err = -ENOMEM; 240 value = kmalloc(map->value_size, GFP_USER); 241 if (!value) 242 goto free_key; 243 244 err = -EFAULT; 245 if (copy_from_user(value, uvalue, map->value_size) != 0) 246 goto free_value; 247 248 /* eBPF program that use maps are running under rcu_read_lock(), 249 * therefore all map accessors rely on this fact, so do the same here 250 */ 251 rcu_read_lock(); 252 err = map->ops->map_update_elem(map, key, value, attr->flags); 253 rcu_read_unlock(); 254 255 free_value: 256 kfree(value); 257 free_key: 258 kfree(key); 259 err_put: 260 fdput(f); 261 return err; 262 } 263 264 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key 265 266 static int map_delete_elem(union bpf_attr *attr) 267 { 268 void __user *ukey = u64_to_ptr(attr->key); 269 int ufd = attr->map_fd; 270 struct bpf_map *map; 271 struct fd f; 272 void *key; 273 int err; 274 275 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 276 return -EINVAL; 277 278 f = fdget(ufd); 279 map = bpf_map_get(f); 280 if (IS_ERR(map)) 281 return PTR_ERR(map); 282 283 err = -ENOMEM; 284 key = kmalloc(map->key_size, GFP_USER); 285 if (!key) 286 goto err_put; 287 288 err = -EFAULT; 289 if (copy_from_user(key, ukey, map->key_size) != 0) 290 goto free_key; 291 292 rcu_read_lock(); 293 err = map->ops->map_delete_elem(map, key); 294 rcu_read_unlock(); 295 296 free_key: 297 kfree(key); 298 err_put: 299 fdput(f); 300 return err; 301 } 302 303 /* last field in 'union bpf_attr' used by this command */ 304 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key 305 306 static int map_get_next_key(union bpf_attr *attr) 307 { 308 void __user *ukey = u64_to_ptr(attr->key); 309 void __user *unext_key = u64_to_ptr(attr->next_key); 310 int ufd = attr->map_fd; 311 struct bpf_map *map; 312 void *key, *next_key; 313 struct fd f; 314 int err; 315 316 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 317 return -EINVAL; 318 319 f = fdget(ufd); 320 map = bpf_map_get(f); 321 if (IS_ERR(map)) 322 return PTR_ERR(map); 323 324 err = -ENOMEM; 325 key = kmalloc(map->key_size, GFP_USER); 326 if (!key) 327 goto err_put; 328 329 err = -EFAULT; 330 if (copy_from_user(key, ukey, map->key_size) != 0) 331 goto free_key; 332 333 err = -ENOMEM; 334 next_key = kmalloc(map->key_size, GFP_USER); 335 if (!next_key) 336 goto free_key; 337 338 rcu_read_lock(); 339 err = map->ops->map_get_next_key(map, key, next_key); 340 rcu_read_unlock(); 341 if (err) 342 goto free_next_key; 343 344 err = -EFAULT; 345 if (copy_to_user(unext_key, next_key, map->key_size) != 0) 346 goto free_next_key; 347 348 err = 0; 349 350 free_next_key: 351 kfree(next_key); 352 free_key: 353 kfree(key); 354 err_put: 355 fdput(f); 356 return err; 357 } 358 359 static LIST_HEAD(bpf_prog_types); 360 361 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) 362 { 363 struct bpf_prog_type_list *tl; 364 365 list_for_each_entry(tl, &bpf_prog_types, list_node) { 366 if (tl->type == type) { 367 prog->aux->ops = tl->ops; 368 prog->type = type; 369 return 0; 370 } 371 } 372 373 return -EINVAL; 374 } 375 376 void bpf_register_prog_type(struct bpf_prog_type_list *tl) 377 { 378 list_add(&tl->list_node, &bpf_prog_types); 379 } 380 381 /* fixup insn->imm field of bpf_call instructions: 382 * if (insn->imm == BPF_FUNC_map_lookup_elem) 383 * insn->imm = bpf_map_lookup_elem - __bpf_call_base; 384 * else if (insn->imm == BPF_FUNC_map_update_elem) 385 * insn->imm = bpf_map_update_elem - __bpf_call_base; 386 * else ... 387 * 388 * this function is called after eBPF program passed verification 389 */ 390 static void fixup_bpf_calls(struct bpf_prog *prog) 391 { 392 const struct bpf_func_proto *fn; 393 int i; 394 395 for (i = 0; i < prog->len; i++) { 396 struct bpf_insn *insn = &prog->insnsi[i]; 397 398 if (insn->code == (BPF_JMP | BPF_CALL)) { 399 /* we reach here when program has bpf_call instructions 400 * and it passed bpf_check(), means that 401 * ops->get_func_proto must have been supplied, check it 402 */ 403 BUG_ON(!prog->aux->ops->get_func_proto); 404 405 if (insn->imm == BPF_FUNC_get_route_realm) 406 prog->dst_needed = 1; 407 if (insn->imm == BPF_FUNC_tail_call) { 408 /* mark bpf_tail_call as different opcode 409 * to avoid conditional branch in 410 * interpeter for every normal call 411 * and to prevent accidental JITing by 412 * JIT compiler that doesn't support 413 * bpf_tail_call yet 414 */ 415 insn->imm = 0; 416 insn->code |= BPF_X; 417 continue; 418 } 419 420 fn = prog->aux->ops->get_func_proto(insn->imm); 421 /* all functions that have prototype and verifier allowed 422 * programs to call them, must be real in-kernel functions 423 */ 424 BUG_ON(!fn->func); 425 insn->imm = fn->func - __bpf_call_base; 426 } 427 } 428 } 429 430 /* drop refcnt on maps used by eBPF program and free auxilary data */ 431 static void free_used_maps(struct bpf_prog_aux *aux) 432 { 433 int i; 434 435 for (i = 0; i < aux->used_map_cnt; i++) 436 bpf_map_put(aux->used_maps[i]); 437 438 kfree(aux->used_maps); 439 } 440 441 static void __prog_put_rcu(struct rcu_head *rcu) 442 { 443 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 444 445 free_used_maps(aux); 446 bpf_prog_free(aux->prog); 447 } 448 449 /* version of bpf_prog_put() that is called after a grace period */ 450 void bpf_prog_put_rcu(struct bpf_prog *prog) 451 { 452 if (atomic_dec_and_test(&prog->aux->refcnt)) { 453 prog->aux->prog = prog; 454 call_rcu(&prog->aux->rcu, __prog_put_rcu); 455 } 456 } 457 458 void bpf_prog_put(struct bpf_prog *prog) 459 { 460 if (atomic_dec_and_test(&prog->aux->refcnt)) { 461 free_used_maps(prog->aux); 462 bpf_prog_free(prog); 463 } 464 } 465 EXPORT_SYMBOL_GPL(bpf_prog_put); 466 467 static int bpf_prog_release(struct inode *inode, struct file *filp) 468 { 469 struct bpf_prog *prog = filp->private_data; 470 471 bpf_prog_put_rcu(prog); 472 return 0; 473 } 474 475 static const struct file_operations bpf_prog_fops = { 476 .release = bpf_prog_release, 477 }; 478 479 static struct bpf_prog *get_prog(struct fd f) 480 { 481 struct bpf_prog *prog; 482 483 if (!f.file) 484 return ERR_PTR(-EBADF); 485 486 if (f.file->f_op != &bpf_prog_fops) { 487 fdput(f); 488 return ERR_PTR(-EINVAL); 489 } 490 491 prog = f.file->private_data; 492 493 return prog; 494 } 495 496 /* called by sockets/tracing/seccomp before attaching program to an event 497 * pairs with bpf_prog_put() 498 */ 499 struct bpf_prog *bpf_prog_get(u32 ufd) 500 { 501 struct fd f = fdget(ufd); 502 struct bpf_prog *prog; 503 504 prog = get_prog(f); 505 506 if (IS_ERR(prog)) 507 return prog; 508 509 atomic_inc(&prog->aux->refcnt); 510 fdput(f); 511 return prog; 512 } 513 EXPORT_SYMBOL_GPL(bpf_prog_get); 514 515 /* last field in 'union bpf_attr' used by this command */ 516 #define BPF_PROG_LOAD_LAST_FIELD kern_version 517 518 static int bpf_prog_load(union bpf_attr *attr) 519 { 520 enum bpf_prog_type type = attr->prog_type; 521 struct bpf_prog *prog; 522 int err; 523 char license[128]; 524 bool is_gpl; 525 526 if (CHECK_ATTR(BPF_PROG_LOAD)) 527 return -EINVAL; 528 529 /* copy eBPF program license from user space */ 530 if (strncpy_from_user(license, u64_to_ptr(attr->license), 531 sizeof(license) - 1) < 0) 532 return -EFAULT; 533 license[sizeof(license) - 1] = 0; 534 535 /* eBPF programs must be GPL compatible to use GPL-ed functions */ 536 is_gpl = license_is_gpl_compatible(license); 537 538 if (attr->insn_cnt >= BPF_MAXINSNS) 539 return -EINVAL; 540 541 if (type == BPF_PROG_TYPE_KPROBE && 542 attr->kern_version != LINUX_VERSION_CODE) 543 return -EINVAL; 544 545 /* plain bpf_prog allocation */ 546 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 547 if (!prog) 548 return -ENOMEM; 549 550 prog->len = attr->insn_cnt; 551 552 err = -EFAULT; 553 if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), 554 prog->len * sizeof(struct bpf_insn)) != 0) 555 goto free_prog; 556 557 prog->orig_prog = NULL; 558 prog->jited = 0; 559 560 atomic_set(&prog->aux->refcnt, 1); 561 prog->gpl_compatible = is_gpl ? 1 : 0; 562 563 /* find program type: socket_filter vs tracing_filter */ 564 err = find_prog_type(type, prog); 565 if (err < 0) 566 goto free_prog; 567 568 /* run eBPF verifier */ 569 err = bpf_check(&prog, attr); 570 if (err < 0) 571 goto free_used_maps; 572 573 /* fixup BPF_CALL->imm field */ 574 fixup_bpf_calls(prog); 575 576 /* eBPF program is ready to be JITed */ 577 err = bpf_prog_select_runtime(prog); 578 if (err < 0) 579 goto free_used_maps; 580 581 err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); 582 if (err < 0) 583 /* failed to allocate fd */ 584 goto free_used_maps; 585 586 return err; 587 588 free_used_maps: 589 free_used_maps(prog->aux); 590 free_prog: 591 bpf_prog_free(prog); 592 return err; 593 } 594 595 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 596 { 597 union bpf_attr attr = {}; 598 int err; 599 600 /* the syscall is limited to root temporarily. This restriction will be 601 * lifted when security audit is clean. Note that eBPF+tracing must have 602 * this restriction, since it may pass kernel data to user space 603 */ 604 if (!capable(CAP_SYS_ADMIN)) 605 return -EPERM; 606 607 if (!access_ok(VERIFY_READ, uattr, 1)) 608 return -EFAULT; 609 610 if (size > PAGE_SIZE) /* silly large */ 611 return -E2BIG; 612 613 /* If we're handed a bigger struct than we know of, 614 * ensure all the unknown bits are 0 - i.e. new 615 * user-space does not rely on any kernel feature 616 * extensions we dont know about yet. 617 */ 618 if (size > sizeof(attr)) { 619 unsigned char __user *addr; 620 unsigned char __user *end; 621 unsigned char val; 622 623 addr = (void __user *)uattr + sizeof(attr); 624 end = (void __user *)uattr + size; 625 626 for (; addr < end; addr++) { 627 err = get_user(val, addr); 628 if (err) 629 return err; 630 if (val) 631 return -E2BIG; 632 } 633 size = sizeof(attr); 634 } 635 636 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 637 if (copy_from_user(&attr, uattr, size) != 0) 638 return -EFAULT; 639 640 switch (cmd) { 641 case BPF_MAP_CREATE: 642 err = map_create(&attr); 643 break; 644 case BPF_MAP_LOOKUP_ELEM: 645 err = map_lookup_elem(&attr); 646 break; 647 case BPF_MAP_UPDATE_ELEM: 648 err = map_update_elem(&attr); 649 break; 650 case BPF_MAP_DELETE_ELEM: 651 err = map_delete_elem(&attr); 652 break; 653 case BPF_MAP_GET_NEXT_KEY: 654 err = map_get_next_key(&attr); 655 break; 656 case BPF_PROG_LOAD: 657 err = bpf_prog_load(&attr); 658 break; 659 default: 660 err = -EINVAL; 661 break; 662 } 663 664 return err; 665 } 666