/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (the underlying map implementation's ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
		/* prog_array stores refcnt-ed bpf_prog pointers;
		 * release them all when user space closes prog_array_fd
		 */
		bpf_fd_array_map_clear(map);

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);

	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}

/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps are running under rcu_read_lock(),
	 * therefore all map accessors rely on this fact, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

static LIST_HEAD(bpf_prog_types);

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->type = type;
			return 0;
		}
	}

	return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}

/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program has passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when the program has bpf_call
			 * instructions and it passed bpf_check(), which means
			 * ops->get_func_proto must have been supplied; check it
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			if (insn->imm == BPF_FUNC_tail_call) {
				/* mark bpf_tail_call as different opcode
				 * to avoid conditional branch in
				 * interpreter for every normal call
				 * and to prevent accidental JITing by
				 * JIT compiler that doesn't support
				 * bpf_tail_call yet
				 */
				insn->imm = 0;
				insn->code |= BPF_X;
				continue;
			}

			fn = prog->aux->ops->get_func_proto(insn->imm);
			/* all functions that have a prototype and that the
			 * verifier allowed programs to call must be real
			 * in-kernel functions
			 */
			BUG_ON(!fn->func);
			insn->imm = fn->func - __bpf_call_base;
		}
	}
}

/* drop refcnt on maps used by the eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}

static void __prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_free(aux->prog);
}

/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		prog->aux->prog = prog;
		call_rcu(&prog->aux->rcu, __prog_put_rcu);
	}
}

void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put_rcu(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};

static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}

/* called by sockets/tracing/seccomp before attaching program to an event
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);

	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);

/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version

static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = false;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	err = bpf_prog_select_runtime(prog);
	if (err < 0)
		goto free_used_maps;

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when the security audit is clean. Note that eBPF+tracing must
	 * have this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
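
A minimal user-space sketch (not part of the kernel file above) of how the bpf(2) syscall defined by SYSCALL_DEFINE3() is typically driven for the map commands handled here, using the hash map type as an example. It assumes the uapi <linux/bpf.h> header, CAP_SYS_ADMIN, and a hypothetical ptr_to_u64() helper mirroring the kernel's u64_to_ptr(); unused fields of union bpf_attr are zeroed because CHECK_ATTR() rejects the command with -EINVAL otherwise.

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

/* hypothetical helper, the user-space mirror of the kernel's u64_to_ptr():
 * pointers are carried in the __aligned_u64 fields of union bpf_attr
 */
static __u64 ptr_to_u64(const void *ptr)
{
	return (__u64)(unsigned long)ptr;
}

int main(void)
{
	union bpf_attr attr;
	__u64 key = 1, value = 42;
	int map_fd;

	/* BPF_MAP_CREATE: only map_type..max_entries are used by this command */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 16;

	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");	/* requires CAP_SYS_ADMIN */
		return 1;
	}

	/* BPF_MAP_UPDATE_ELEM: key/value are user pointers packed into u64s */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = ptr_to_u64(&key);
	attr.value = ptr_to_u64(&value);
	attr.flags = BPF_ANY;
	if (syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) != 0)
		perror("BPF_MAP_UPDATE_ELEM");

	/* BPF_MAP_LOOKUP_ELEM: the kernel copies the value back to user space */
	value = 0;
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = ptr_to_u64(&key);
	attr.value = ptr_to_u64(&value);
	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
		printf("value = %llu\n", (unsigned long long)value);

	return 0;
}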