// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
#include <linux/poll.h>

struct bpf_struct_ops_value {
	struct bpf_struct_ops_common_value common;
	char data[] ____cacheline_aligned_in_smp;
};

#define MAX_TRAMP_IMAGE_PAGES 8

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops_desc *st_ops_desc;
	/* protect map_update */
	struct mutex lock;
	/* links has all the bpf_links that are populated
	 * to the func ptrs of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_link **links;
	u32 links_cnt;
	u32 image_pages_cnt;
	/* image_pages is an array of pages that has all the trampolines
	 * that store the func args before calling the bpf_prog.
	 */
	void *image_pages[MAX_TRAMP_IMAGE_PAGES];
	/* The owner module's btf. */
	struct btf *btf;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue. For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

struct bpf_struct_ops_link {
	struct bpf_link link;
	struct bpf_map __rcu *map;
	wait_queue_head_t wait_hup;
};

static DEFINE_MUTEX(update_mutex);

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
	.test_run = bpf_struct_ops_test_run,
#endif
};

BTF_ID_LIST(st_ops_ids)
BTF_ID(struct, module)
BTF_ID(struct, bpf_struct_ops_common_value)

enum {
	IDX_MODULE_ID,
	IDX_ST_OPS_COMMON_VALUE_ID,
};

extern struct btf *btf_vmlinux;

static bool is_valid_value_type(struct btf *btf, s32 value_id,
				const struct btf_type *type,
				const char *value_name)
{
	const struct btf_type *common_value_type;
	const struct btf_member *member;
	const struct btf_type *vt, *mt;

	vt = btf_type_by_id(btf, value_id);
	if (btf_vlen(vt) != 2) {
		pr_warn("The number of %s's members should be 2, but we get %d\n",
			value_name, btf_vlen(vt));
		return false;
	}
	member = btf_type_member(vt);
	mt = btf_type_by_id(btf, member->type);
	common_value_type = btf_type_by_id(btf_vmlinux,
					   st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
	if (mt != common_value_type) {
		pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
			value_name);
		return false;
	}
	member++;
	mt = btf_type_by_id(btf, member->type);
	if (mt != type) {
		pr_warn("The second member of %s should be %s\n",
			value_name, btf_name_by_offset(btf, type->name_off));
		return false;
	}

	return true;
}
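
/* For illustration: for a hypothetical struct_ops type named "foo_ops",
 * is_valid_value_type() above expects the subsystem's BTF to describe a
 * wrapper struct named VALUE_PREFIX + "foo_ops" with exactly two members,
 * the common header first and the ops struct second, roughly:
 *
 *	struct bpf_struct_ops_foo_ops {
 *		struct bpf_struct_ops_common_value common;
 *		struct foo_ops data;
 *	};
 *
 * "foo_ops" and its wrapper are illustrative names only, not types defined
 * in this file.
 */
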
static void *bpf_struct_ops_image_alloc(void)
{
	void *image;
	int err;

	err = bpf_jit_charge_modmem(PAGE_SIZE);
	if (err)
		return ERR_PTR(err);
	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
	if (!image) {
		bpf_jit_uncharge_modmem(PAGE_SIZE);
		return ERR_PTR(-ENOMEM);
	}

	return image;
}

void bpf_struct_ops_image_free(void *image)
{
	if (image) {
		arch_free_bpf_trampoline(image, PAGE_SIZE);
		bpf_jit_uncharge_modmem(PAGE_SIZE);
	}
}

#define MAYBE_NULL_SUFFIX "__nullable"
#define MAX_STUB_NAME 128

/* Return the type info of a stub function, if it exists.
 *
 * The name of a stub function is made up of the name of the struct_ops and
 * the name of the function pointer member, separated by "__". For example,
 * if the struct_ops type is named "foo_ops" and the function pointer
 * member is named "bar", the stub function name would be "foo_ops__bar".
 */
static const struct btf_type *
find_stub_func_proto(const struct btf *btf, const char *st_op_name,
		     const char *member_name)
{
	char stub_func_name[MAX_STUB_NAME];
	const struct btf_type *func_type;
	s32 btf_id;
	int cp;

	cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
		      st_op_name, member_name);
	if (cp >= MAX_STUB_NAME) {
		pr_warn("Stub function name too long\n");
		return NULL;
	}
	btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
	if (btf_id < 0)
		return NULL;
	func_type = btf_type_by_id(btf, btf_id);
	if (!func_type)
		return NULL;

	return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
}
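
/* For example, a hypothetical subsystem with a struct_ops type "foo_ops"
 * and a function pointer member "bar" would provide a stub named
 * "foo_ops__bar". Marking an argument with the "__nullable" suffix in the
 * stub tells the verifier that BPF programs may be handed a NULL pointer
 * there:
 *
 *	static int foo_ops__bar(struct foo_ctx *ctx__nullable)
 *	{
 *		return 0;
 *	}
 *
 * "foo_ops", "bar" and "foo_ctx" are illustrative names only; the stub's
 * arguments must otherwise match the member's func_proto.
 */
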
/* Prepare argument info for every nullable argument of a member of a
 * struct_ops type.
 *
 * Initialize a struct bpf_struct_ops_arg_info according to type info of
 * the arguments of a stub function. (Check kCFI for more information about
 * stub functions.)
 *
 * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
 * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
 * the information used by the verifier to check the arguments of the
 * BPF struct_ops program assigned to the member. Here, we only care about
 * the arguments that are marked as __nullable.
 *
 * The array of struct bpf_ctx_arg_aux is eventually assigned to
 * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
 * verifier. (See check_struct_ops_btf_id())
 *
 * arg_info->info will be the list of struct bpf_ctx_arg_aux on success.
 * On failure, it is left untouched.
 */
static int prepare_arg_info(struct btf *btf,
			    const char *st_ops_name,
			    const char *member_name,
			    const struct btf_type *func_proto,
			    struct bpf_struct_ops_arg_info *arg_info)
{
	const struct btf_type *stub_func_proto, *pointed_type;
	const struct btf_param *stub_args, *args;
	struct bpf_ctx_arg_aux *info, *info_buf;
	u32 nargs, arg_no, info_cnt = 0;
	u32 arg_btf_id;
	int offset;

	stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
	if (!stub_func_proto)
		return 0;

	/* Check if the number of arguments of the stub function is the same
	 * as the number of arguments of the function pointer.
	 */
	nargs = btf_type_vlen(func_proto);
	if (nargs != btf_type_vlen(stub_func_proto)) {
		pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
			st_ops_name, member_name, member_name, st_ops_name);
		return -EINVAL;
	}

	if (!nargs)
		return 0;

	args = btf_params(func_proto);
	stub_args = btf_params(stub_func_proto);

	info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
	if (!info_buf)
		return -ENOMEM;

	/* Prepare info for every nullable argument */
	info = info_buf;
	for (arg_no = 0; arg_no < nargs; arg_no++) {
		/* Skip arguments that are not suffixed with
		 * "__nullable".
		 */
		if (!btf_param_match_suffix(btf, &stub_args[arg_no],
					    MAYBE_NULL_SUFFIX))
			continue;

		/* Should be a pointer to struct */
		pointed_type = btf_type_resolve_ptr(btf,
						    args[arg_no].type,
						    &arg_btf_id);
		if (!pointed_type ||
		    !btf_type_is_struct(pointed_type)) {
			pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
				st_ops_name, member_name, MAYBE_NULL_SUFFIX);
			goto err_out;
		}

		offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
		if (offset < 0) {
			pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
				st_ops_name, member_name, arg_no);
			goto err_out;
		}

		if (args[arg_no].type != stub_args[arg_no].type) {
			pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
				arg_no, st_ops_name, member_name);
			goto err_out;
		}

		/* Fill the information of the new argument */
		info->reg_type =
			PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
		info->btf_id = arg_btf_id;
		info->btf = btf;
		info->offset = offset;

		info++;
		info_cnt++;
	}

	if (info_cnt) {
		arg_info->info = info_buf;
		arg_info->cnt = info_cnt;
	} else {
		kfree(info_buf);
	}

	return 0;

err_out:
	kfree(info_buf);

	return -EINVAL;
}
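
/* Because a __nullable ctx arg is marked PTR_MAYBE_NULL above, the verifier
 * will only accept a struct_ops program that checks the pointer before
 * dereferencing it. A hypothetical program for the "foo_ops__bar" stub
 * sketched earlier would look roughly like:
 *
 *	SEC("struct_ops/bar")
 *	int BPF_PROG(bar, struct foo_ctx *ctx)
 *	{
 *		if (!ctx)
 *			return 0;
 *		return ctx->some_field;
 *	}
 *
 * The names are illustrative only; the point is the mandatory NULL check.
 */
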
/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
{
	struct bpf_struct_ops_arg_info *arg_info;
	int i;

	arg_info = st_ops_desc->arg_info;
	for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
		kfree(arg_info[i].info);

	kfree(arg_info);
}

int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
			     struct btf *btf,
			     struct bpf_verifier_log *log)
{
	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
	struct bpf_struct_ops_arg_info *arg_info;
	const struct btf_member *member;
	const struct btf_type *t;
	s32 type_id, value_id;
	char value_name[128];
	const char *mname;
	int i, err;

	if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
	    sizeof(value_name)) {
		pr_warn("struct_ops name %s is too long\n",
			st_ops->name);
		return -EINVAL;
	}
	sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

	if (!st_ops->cfi_stubs) {
		pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name);
		return -EINVAL;
	}

	type_id = btf_find_by_name_kind(btf, st_ops->name,
					BTF_KIND_STRUCT);
	if (type_id < 0) {
		pr_warn("Cannot find struct %s in %s\n",
			st_ops->name, btf_get_name(btf));
		return -EINVAL;
	}
	t = btf_type_by_id(btf, type_id);
	if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
		pr_warn("Cannot support #%u members in struct %s\n",
			btf_type_vlen(t), st_ops->name);
		return -EINVAL;
	}

	value_id = btf_find_by_name_kind(btf, value_name,
					 BTF_KIND_STRUCT);
	if (value_id < 0) {
		pr_warn("Cannot find struct %s in %s\n",
			value_name, btf_get_name(btf));
		return -EINVAL;
	}
	if (!is_valid_value_type(btf, value_id, t, value_name))
		return -EINVAL;

	arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
			   GFP_KERNEL);
	if (!arg_info)
		return -ENOMEM;

	st_ops_desc->arg_info = arg_info;
	st_ops_desc->type = t;
	st_ops_desc->type_id = type_id;
	st_ops_desc->value_id = value_id;
	st_ops_desc->value_type = btf_type_by_id(btf, value_id);

	for_each_member(i, t, member) {
		const struct btf_type *func_proto;

		mname = btf_name_by_offset(btf, member->name_off);
		if (!*mname) {
			pr_warn("anon member in struct %s is not supported\n",
				st_ops->name);
			err = -EOPNOTSUPP;
			goto errout;
		}

		if (__btf_member_bitfield_size(t, member)) {
			pr_warn("bit field member %s in struct %s is not supported\n",
				mname, st_ops->name);
			err = -EOPNOTSUPP;
			goto errout;
		}

		func_proto = btf_type_resolve_func_ptr(btf,
						       member->type,
						       NULL);
		if (!func_proto)
			continue;

		if (btf_distill_func_proto(log, btf,
					   func_proto, mname,
					   &st_ops->func_models[i])) {
			pr_warn("Error in parsing func ptr %s in struct %s\n",
				mname, st_ops->name);
			err = -EINVAL;
			goto errout;
		}

		err = prepare_arg_info(btf, st_ops->name, mname,
				       func_proto,
				       arg_info + i);
		if (err)
			goto errout;
	}

	if (st_ops->init(btf)) {
		pr_warn("Error in init bpf_struct_ops %s\n",
			st_ops->name);
		err = -EINVAL;
		goto errout;
	}

	return 0;

errout:
	bpf_struct_ops_desc_release(st_ops_desc);

	return err;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}
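
/* The callbacks consumed by bpf_struct_ops_desc_init() and by the map code
 * below come from the subsystem's struct bpf_struct_ops definition. A
 * minimal, hypothetical definition might look like (illustrative only; see
 * a real subsystem such as tcp_congestion_ops for an authoritative
 * example):
 *
 *	static struct bpf_struct_ops bpf_foo_ops = {
 *		.init		= foo_ops_init,
 *		.init_member	= foo_ops_init_member,
 *		.reg		= foo_ops_reg,
 *		.unreg		= foo_ops_unreg,
 *		.cfi_stubs	= &foo_ops_cfi_stubs,
 *		.name		= "foo_ops",
 *		.owner		= THIS_MODULE,
 *	};
 *
 * All of the foo_* names are assumptions for the sketch, not symbols
 * defined in this file.
 */
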
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;
	s64 refcnt;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->common.state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed. state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->common.state = state;

	/* This value offers the user space a general estimate of how
	 * many sockets are still utilizing this struct_ops for TCP
	 * congestion control. The number might not be exact, but it
	 * should sufficiently meet our present goals.
	 */
	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
	refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));

	return 0;
}

static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	u32 i;

	for (i = 0; i < st_map->links_cnt; i++) {
		if (st_map->links[i]) {
			bpf_link_put(st_map->links[i]);
			st_map->links[i] = NULL;
		}
	}
}

static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
	int i;

	for (i = 0; i < st_map->image_pages_cnt; i++)
		bpf_struct_ops_image_free(st_map->image_pages[i]);
	st_map->image_pages_cnt = 0;
}

static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = __btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf, member->type);
		mtype = btf_resolve_size(btf, mtype, &msize);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}

static void bpf_struct_ops_link_release(struct bpf_link *link)
{
}

static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
{
	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);

	kfree(tlink);
}

const struct bpf_link_ops bpf_struct_ops_link_lops = {
	.release = bpf_struct_ops_link_release,
	.dealloc = bpf_struct_ops_link_dealloc,
};
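
/* check_zero_holes() walks the BTF layout and rejects a value whose
 * compiler-inserted padding is not zeroed. For example, with a hypothetical
 * layout like:
 *
 *	struct foo_ops {
 *		u8  flag;	// offset 0, size 1
 *				// 7 bytes of padding
 *		u64 counter;	// offset 8, size 8
 *	};
 *
 * the bytes at offsets 1..7 (and any tail padding) must be zero in the
 * value supplied by userspace, otherwise map_update returns -EINVAL.
 */
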
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
				      struct bpf_tramp_link *link,
				      const struct btf_func_model *model,
				      void *stub_func,
				      void **_image, u32 *_image_off,
				      bool allow_alloc)
{
	u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
	void *image = *_image;
	int size;

	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;

	if (model->ret_size > 0)
		flags |= BPF_TRAMP_F_RET_FENTRY_RET;

	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
	if (size <= 0)
		return size ? : -EFAULT;

	/* Allocate image buffer if necessary */
	if (!image || size > PAGE_SIZE - image_off) {
		if (!allow_alloc)
			return -E2BIG;

		image = bpf_struct_ops_image_alloc();
		if (IS_ERR(image))
			return PTR_ERR(image);
		image_off = 0;
	}

	size = arch_prepare_bpf_trampoline(NULL, image + image_off,
					   image + image_off + size,
					   model, flags, tlinks, stub_func);
	if (size <= 0) {
		if (image != *_image)
			bpf_struct_ops_image_free(image);
		return size ? : -EFAULT;
	}

	*_image = image;
	*_image_off = image_off + size;
	return 0;
}
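
/* Trampolines for the members of one struct_ops map are packed back to
 * back into the pages tracked in st_map->image_pages[]. A worked example
 * under assumed sizes: with PAGE_SIZE = 4096 and three trampolines of
 * 200, 300 and 3700 bytes, the first two land in page 0 at offsets 0 and
 * 200 (image_off becomes 500), while the third does not fit in the
 * remaining 3596 bytes, so a new page is charged and allocated and that
 * trampoline starts at offset 0 of page 1. At most MAX_TRAMP_IMAGE_PAGES
 * pages are used per map; beyond that, the update fails with -E2BIG.
 */
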
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
	const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_type *module_type;
	const struct btf_member *member;
	const struct btf_type *t = st_ops_desc->type;
	struct bpf_tramp_links *tlinks;
	void *udata, *kdata;
	int prog_fd, err;
	u32 i, trampoline_start, image_off = 0;
	void *cur_image = NULL, *image = NULL;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value);
	if (err)
		return err;

	uvalue = value;
	err = check_zero_holes(st_map->btf, t, uvalue->data);
	if (err)
		return err;

	if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
		return -EINVAL;

	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
	if (!tlinks)
		return -ENOMEM;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;

	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		struct bpf_tramp_link *link;
		u32 moff;

		moff = __btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

		/* If st_ops->init_member does not handle it,
		 * we will only handle func ptrs and zero-ed members
		 * here. Reject everything else.
		 */

		/* All non func ptr members must be 0 */
		if (!ptype || !btf_type_is_func_proto(ptype)) {
			u32 msize;

			mtype = btf_type_by_id(st_map->btf, member->type);
			mtype = btf_resolve_size(st_map->btf, mtype, &msize);
			if (IS_ERR(mtype)) {
				err = PTR_ERR(mtype);
				goto reset_unlock;
			}

			if (memchr_inv(udata + moff, 0, msize)) {
				err = -EINVAL;
				goto reset_unlock;
			}

			continue;
		}

		prog_fd = (int)(*(unsigned long *)(udata + moff));
		/* Similar check as the attr->attach_prog_fd */
		if (!prog_fd)
			continue;

		prog = bpf_prog_get(prog_fd);
		if (IS_ERR(prog)) {
			err = PTR_ERR(prog);
			goto reset_unlock;
		}

		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
		    prog->aux->attach_btf_id != st_ops_desc->type_id ||
		    prog->expected_attach_type != i) {
			bpf_prog_put(prog);
			err = -EINVAL;
			goto reset_unlock;
		}

		link = kzalloc(sizeof(*link), GFP_USER);
		if (!link) {
			bpf_prog_put(prog);
			err = -ENOMEM;
			goto reset_unlock;
		}
		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
			      &bpf_struct_ops_link_lops, prog);
		st_map->links[i] = &link->link;

		trampoline_start = image_off;
		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
						&st_ops->func_models[i],
						*(void **)(st_ops->cfi_stubs + moff),
						&image, &image_off,
						st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
		if (err)
			goto reset_unlock;

		if (cur_image != image) {
			st_map->image_pages[st_map->image_pages_cnt++] = image;
			cur_image = image;
			trampoline_start = 0;
		}

		*(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();

		/* put prog_id to udata */
		*(unsigned long *)(udata + moff) = prog->aux->id;
	}

	if (st_ops->validate) {
		err = st_ops->validate(kdata);
		if (err)
			goto reset_unlock;
	}
	for (i = 0; i < st_map->image_pages_cnt; i++) {
		err = arch_protect_bpf_trampoline(st_map->image_pages[i],
						  PAGE_SIZE);
		if (err)
			goto reset_unlock;
	}

	if (st_map->map.map_flags & BPF_F_LINK) {
		err = 0;
		/* Let bpf_link handle registration & unregistration.
		 *
		 * Pair with smp_load_acquire() during lookup_elem().
		 */
		smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
		goto unlock;
	}

	err = st_ops->reg(kdata, NULL);
	if (likely(!err)) {
		/* This refcnt increment on the map here after
		 * 'st_ops->reg()' is secure since the state of the
		 * map must be set to INIT at this moment, and thus
		 * bpf_struct_ops_map_delete_elem() can't unregister
		 * or transition it to TOBEFREE concurrently.
		 */
		bpf_map_inc(map);
		/* Pair with smp_load_acquire() during lookup_elem().
		 * It ensures the above udata updates (e.g. prog->aux->id)
		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
		 */
		smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
		goto unlock;
	}

	/* Error during st_ops->reg(). Can happen if this struct_ops needs to be
	 * verified as a whole, after all init_member() calls. Can also happen if
	 * there was a race in registering the struct_ops (under the same name) to
	 * a sub-system through different struct_ops's maps.
	 */

reset_unlock:
	bpf_struct_ops_map_free_image(st_map);
	bpf_struct_ops_map_put_progs(st_map);
	memset(uvalue, 0, map->value_size);
	memset(kvalue, 0, map->value_size);
unlock:
	kfree(tlinks);
	mutex_unlock(&st_map->lock);
	return err;
}
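
/* Summary of the kvalue->common.state transitions driven by the code above
 * and by bpf_struct_ops_map_delete_elem() below:
 *
 *	INIT  --update_elem() on a BPF_F_LINK map-->  READY    (attached later via bpf_link)
 *	INIT  --update_elem() + st_ops->reg()----->   INUSE    (registered directly)
 *	INUSE --delete_elem()--------------------->   TOBEFREE (unregistered)
 *
 * Lookups pair smp_load_acquire() with the smp_store_release() calls above,
 * so userspace observes the prog IDs in uvalue only after the state is set.
 */
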
static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
	enum bpf_struct_ops_state prev_state;
	struct bpf_struct_ops_map *st_map;

	st_map = (struct bpf_struct_ops_map *)map;
	if (st_map->map.map_flags & BPF_F_LINK)
		return -EOPNOTSUPP;

	prev_state = cmpxchg(&st_map->kvalue.common.state,
			     BPF_STRUCT_OPS_STATE_INUSE,
			     BPF_STRUCT_OPS_STATE_TOBEFREE);
	switch (prev_state) {
	case BPF_STRUCT_OPS_STATE_INUSE:
		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
		bpf_map_put(map);
		return 0;
	case BPF_STRUCT_OPS_STATE_TOBEFREE:
		return -EINPROGRESS;
	case BPF_STRUCT_OPS_STATE_INIT:
		return -ENOENT;
	default:
		WARN_ON_ONCE(1);
		/* Should never happen. Treat it as not found. */
		return -ENOENT;
	}
}

static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
					     struct seq_file *m)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	void *value;
	int err;

	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		return;

	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	if (!err) {
		btf_type_seq_show(st_map->btf,
				  map->btf_vmlinux_value_type_id,
				  value, m);
		seq_puts(m, "\n");
	}

	kfree(value);
}

static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	if (st_map->links)
		bpf_struct_ops_map_put_progs(st_map);
	bpf_map_area_free(st_map->links);
	bpf_struct_ops_map_free_image(st_map);
	bpf_map_area_free(st_map->uvalue);
	bpf_map_area_free(st_map);
}
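
/* __bpf_struct_ops_map_free() releases the resources directly and is used
 * on the map_alloc() error path below, before the map could have been
 * published to BPF programs or subsystems. Once the map may be in use,
 * bpf_struct_ops_map_free() must be used instead: it waits for both a
 * regular RCU and an RCU-tasks grace period before tearing anything down.
 */
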
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	/* st_ops->owner was acquired during map_alloc to implicitly hold
	 * the btf's refcnt. The module reference was only taken when
	 * btf_is_module(st_map->btf); st_map->btf cannot be NULL here.
	 */
	if (btf_is_module(st_map->btf))
		module_put(st_map->st_ops_desc->st_ops->owner);

	/* The struct_ops's function may switch to another struct_ops.
	 *
	 * For example, bpf_tcp_cc_x->init() may switch to
	 * another tcp_cc_y by calling
	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
	 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
	 * and its refcount may reach 0 which then frees its
	 * trampoline image while tcp_cc_x is still running.
	 *
	 * A vanilla RCU grace period waits for all bpf-tcp-cc progs
	 * to finish; bpf-tcp-cc progs are non-sleepable.
	 * An rcu_tasks grace period waits for the last few insns
	 * in the trampoline image to finish before releasing
	 * the trampoline image.
	 */
	synchronize_rcu_mult(call_rcu, call_rcu_tasks);

	__bpf_struct_ops_map_free(map);
}

static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
	    (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
	    !attr->btf_vmlinux_value_type_id)
		return -EINVAL;
	return 0;
}

static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
	const struct bpf_struct_ops_desc *st_ops_desc;
	size_t st_map_size;
	struct bpf_struct_ops_map *st_map;
	const struct btf_type *t, *vt;
	struct module *mod = NULL;
	struct bpf_map *map;
	struct btf *btf;
	int ret;

	if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
		/* The map holds btf for its whole life time. */
		btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
		if (IS_ERR(btf))
			return ERR_CAST(btf);
		if (!btf_is_module(btf)) {
			btf_put(btf);
			return ERR_PTR(-EINVAL);
		}

		mod = btf_try_get_module(btf);
		/* mod holds a refcnt to btf. We don't need an extra refcnt
		 * here.
		 */
		btf_put(btf);
		if (!mod)
			return ERR_PTR(-EINVAL);
	} else {
		btf = bpf_get_btf_vmlinux();
		if (IS_ERR(btf))
			return ERR_CAST(btf);
		if (!btf)
			return ERR_PTR(-ENOTSUPP);
	}

	st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
	if (!st_ops_desc) {
		ret = -ENOTSUPP;
		goto errout;
	}

	vt = st_ops_desc->value_type;
	if (attr->value_size != vt->size) {
		ret = -EINVAL;
		goto errout;
	}

	t = st_ops_desc->type;

	st_map_size = sizeof(*st_map) +
		      /* kvalue stores the
		       * struct bpf_struct_ops_tcp_congestion_ops
		       */
		      (vt->size - sizeof(struct bpf_struct_ops_value));

	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
	if (!st_map) {
		ret = -ENOMEM;
		goto errout;
	}

	st_map->st_ops_desc = st_ops_desc;
	map = &st_map->map;

	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
	st_map->links_cnt = btf_type_vlen(t);
	st_map->links =
		bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_link *),
				   NUMA_NO_NODE);
	if (!st_map->uvalue || !st_map->links) {
		ret = -ENOMEM;
		goto errout_free;
	}
	st_map->btf = btf;

	mutex_init(&st_map->lock);
	bpf_map_init_from_attr(map, attr);

	return map;

errout_free:
	__bpf_struct_ops_map_free(map);
errout:
	module_put(mod);

	return ERR_PTR(ret);
}

static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
	const struct btf_type *vt = st_ops_desc->value_type;
	u64 usage;

	usage = sizeof(*st_map) +
			vt->size - sizeof(struct bpf_struct_ops_value);
	usage += vt->size;
	usage += btf_type_vlen(vt) * sizeof(struct bpf_link *);
	usage += PAGE_SIZE;
	return usage;
}
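
/* From the map_alloc_check()/map_alloc() constraints above, a struct_ops
 * map is created with key_size == 4, max_entries == 1, value_size equal to
 * the BTF value type's size, and btf_vmlinux_value_type_id pointing at the
 * "bpf_struct_ops_<name>" wrapper. A hypothetical, libbpf-less sketch of
 * the map-creation attributes:
 *
 *	union bpf_attr attr = {
 *		.map_type                  = BPF_MAP_TYPE_STRUCT_OPS,
 *		.key_size                  = 4,
 *		.value_size                = value_type_size,
 *		.max_entries               = 1,
 *		.map_flags                 = BPF_F_LINK,
 *		.btf_vmlinux_value_type_id = value_type_id,
 *	};
 *
 * value_type_size/value_type_id stand for values resolved from BTF; in
 * practice libbpf derives them from the SEC(".struct_ops.link") map.
 */
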
BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
	.map_alloc_check = bpf_struct_ops_map_alloc_check,
	.map_alloc = bpf_struct_ops_map_alloc,
	.map_free = bpf_struct_ops_map_free,
	.map_get_next_key = bpf_struct_ops_map_get_next_key,
	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
	.map_delete_elem = bpf_struct_ops_map_delete_elem,
	.map_update_elem = bpf_struct_ops_map_update_elem,
	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
	.map_mem_usage = bpf_struct_ops_map_mem_usage,
	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};

/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	map = __bpf_map_inc_not_zero(&st_map->map, false);
	return !IS_ERR(map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	bpf_map_put(&st_map->map);
}

static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
		map->map_flags & BPF_F_LINK &&
		/* Pair with smp_store_release() during map_update */
		smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
}

static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_struct_ops_map *st_map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = (struct bpf_struct_ops_map *)
		rcu_dereference_protected(st_link->map, true);
	if (st_map) {
		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
		bpf_map_put(&st_map->map);
	}
	kfree(st_link);
}

static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
						struct seq_file *seq)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	if (map)
		seq_printf(seq, "map_id:\t%d\n", map->id);
	rcu_read_unlock();
}

static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
						  struct bpf_link_info *info)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	if (map)
		info->struct_ops.map_id = map->id;
	rcu_read_unlock();
	return 0;
}
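
/* bpf_struct_ops_get()/bpf_struct_ops_put() let a subsystem pin the map
 * while the kernel struct it registered is in use. A hypothetical sketch
 * of how a subsystem might wrap them around its own lifetime hooks:
 *
 *	static int foo_attach(const struct foo_ops *ops)
 *	{
 *		if (!bpf_struct_ops_get(ops))	// map already going away
 *			return -EBUSY;
 *		...
 *		return 0;
 *	}
 *
 *	static void foo_detach(const struct foo_ops *ops)
 *	{
 *		...
 *		bpf_struct_ops_put(ops);	// may free the trampolines
 *	}
 *
 * The foo_* names are illustrative; tcp_congestion_ops does something
 * similar around socket assignment.
 */
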
static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
					  struct bpf_map *expected_old_map)
{
	struct bpf_struct_ops_map *st_map, *old_st_map;
	struct bpf_map *old_map;
	struct bpf_struct_ops_link *st_link;
	int err;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = container_of(new_map, struct bpf_struct_ops_map, map);

	if (!bpf_struct_ops_valid_to_reg(new_map))
		return -EINVAL;

	if (!st_map->st_ops_desc->st_ops->update)
		return -EOPNOTSUPP;

	mutex_lock(&update_mutex);

	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
	if (!old_map) {
		err = -ENOLINK;
		goto err_out;
	}
	if (expected_old_map && old_map != expected_old_map) {
		err = -EPERM;
		goto err_out;
	}

	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
	/* The new and old struct_ops must be the same type. */
	if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
		err = -EINVAL;
		goto err_out;
	}

	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
	if (err)
		goto err_out;

	bpf_map_inc(new_map);
	rcu_assign_pointer(st_link->map, new_map);
	bpf_map_put(old_map);

err_out:
	mutex_unlock(&update_mutex);

	return err;
}

static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
{
	struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;

	mutex_lock(&update_mutex);

	map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
	if (!map) {
		mutex_unlock(&update_mutex);
		return 0;
	}
	st_map = container_of(map, struct bpf_struct_ops_map, map);

	st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);

	RCU_INIT_POINTER(st_link->map, NULL);
	/* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
	 * bpf_map_inc() in bpf_struct_ops_map_link_update().
	 */
	bpf_map_put(&st_map->map);

	mutex_unlock(&update_mutex);

	wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);

	return 0;
}

static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
					     struct poll_table_struct *pts)
{
	struct bpf_struct_ops_link *st_link = file->private_data;

	poll_wait(file, &st_link->wait_hup, pts);

	return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
}

static const struct bpf_link_ops bpf_struct_ops_map_lops = {
	.dealloc = bpf_struct_ops_map_link_dealloc,
	.detach = bpf_struct_ops_map_link_detach,
	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
	.update_map = bpf_struct_ops_map_link_update,
	.poll = bpf_struct_ops_map_link_poll,
};
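
/* The .poll callback above lets userspace wait for the subsystem (or a
 * LINK_DETACH command) to detach the struct_ops. A hypothetical userspace
 * sketch, assuming link_fd is the struct_ops link fd:
 *
 *	struct pollfd pfd = { .fd = link_fd, .events = POLLHUP };
 *
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLHUP)
 *		;	// the link's map has been detached by the kernel
 *
 * wake_up_interruptible_poll(..., EPOLLHUP) in link_detach() is what wakes
 * such a waiter.
 */
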
int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	struct bpf_struct_ops_link *link = NULL;
	struct bpf_link_primer link_primer;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;
	int err;

	map = bpf_map_get(attr->link_create.map_fd);
	if (IS_ERR(map))
		return PTR_ERR(map);

	st_map = (struct bpf_struct_ops_map *)map;

	if (!bpf_struct_ops_valid_to_reg(map)) {
		err = -EINVAL;
		goto err_out;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto err_out;
	}
	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);

	err = bpf_link_prime(&link->link, &link_primer);
	if (err)
		goto err_out;

	init_waitqueue_head(&link->wait_hup);

	/* Hold the update_mutex such that the subsystem cannot
	 * do link->ops->detach() before the link is fully initialized.
	 */
	mutex_lock(&update_mutex);
	err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
	if (err) {
		mutex_unlock(&update_mutex);
		bpf_link_cleanup(&link_primer);
		link = NULL;
		goto err_out;
	}
	RCU_INIT_POINTER(link->map, map);
	mutex_unlock(&update_mutex);

	return bpf_link_settle(&link_primer);

err_out:
	bpf_map_put(map);
	kfree(link);
	return err;
}

void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	info->btf_vmlinux_id = btf_obj_id(st_map->btf);
}
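
/* End-to-end, the link-based attach path is: create the map with
 * BPF_F_LINK, update element 0 with the prog fds (state becomes READY),
 * then BPF_LINK_CREATE with link_create.map_fd pointing at the map, which
 * calls st_ops->reg() above. A hypothetical libbpf-level sketch:
 *
 *	struct bpf_map *map = bpf_object__find_map_by_name(obj, "foo_ops");
 *	struct bpf_link *link = bpf_map__attach_struct_ops(map);
 *
 *	// ... later: bpf_link__detach(link) ends up in
 *	// bpf_struct_ops_map_link_detach() and wakes any poll()ers.
 *
 * "foo_ops" is an illustrative map name; bpf_map__attach_struct_ops() is
 * the libbpf helper that issues the BPF_LINK_CREATE command for such maps.
 */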