// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>

enum bpf_struct_ops_state {
	BPF_STRUCT_OPS_STATE_INIT,
	BPF_STRUCT_OPS_STATE_INUSE,
	BPF_STRUCT_OPS_STATE_TOBEFREE,
	BPF_STRUCT_OPS_STATE_READY,
};

#define BPF_STRUCT_OPS_COMMON_VALUE			\
	refcount_t refcnt;				\
	enum bpf_struct_ops_state state

struct bpf_struct_ops_value {
	BPF_STRUCT_OPS_COMMON_VALUE;
	char data[] ____cacheline_aligned_in_smp;
};

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops *st_ops;
	/* protect map_update */
	struct mutex lock;
	/* links[] has all the bpf_links that are populated
	 * to the func ptrs of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_link **links;
	/* image is a page that has all the trampolines
	 * that store the func args before calling the bpf_prog.
	 * A PAGE_SIZE "image" is enough to store all trampolines for
	 * "links[]".
	 */
	void *image;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue.  For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

struct bpf_struct_ops_link {
	struct bpf_link link;
	struct bpf_map __rcu *map;
};

static DEFINE_MUTEX(update_mutex);

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
 * the map's value exposed to the userspace and its btf-type-id is
 * stored at the map->btf_vmlinux_value_type_id.
 */
#define BPF_STRUCT_OPS_TYPE(_name)				\
extern struct bpf_struct_ops bpf_##_name;			\
								\
struct bpf_struct_ops_##_name {					\
	BPF_STRUCT_OPS_COMMON_VALUE;				\
	struct _name data ____cacheline_aligned_in_smp;	\
};
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

enum {
#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
	__NR_BPF_STRUCT_OPS_TYPE,
};

static struct bpf_struct_ops * const bpf_struct_ops[] = {
#define BPF_STRUCT_OPS_TYPE(_name)				\
	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
};

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
	.test_run = bpf_struct_ops_test_run,
#endif
};

static const struct btf_type *module_type;

void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
{
	s32 type_id, value_id, module_id;
	const struct btf_member *member;
	struct bpf_struct_ops *st_ops;
	const struct btf_type *t;
	char value_name[128];
	const char *mname;
	u32 i, j;

	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
	if (module_id < 0) {
		pr_warn("Cannot find struct module in btf_vmlinux\n");
		return;
	}
	module_type = btf_type_by_id(btf, module_id);

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		st_ops = bpf_struct_ops[i];

		if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
		    sizeof(value_name)) {
			pr_warn("struct_ops name %s is too long\n",
				st_ops->name);
			continue;
		}
		sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

		value_id = btf_find_by_name_kind(btf, value_name,
						 BTF_KIND_STRUCT);
		if (value_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				value_name);
			continue;
		}

		type_id = btf_find_by_name_kind(btf, st_ops->name,
						BTF_KIND_STRUCT);
		if (type_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				st_ops->name);
			continue;
		}
		t = btf_type_by_id(btf, type_id);
		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
			pr_warn("Cannot support #%u members in struct %s\n",
				btf_type_vlen(t), st_ops->name);
			continue;
		}

		for_each_member(j, t, member) {
			const struct btf_type *func_proto;

			mname = btf_name_by_offset(btf, member->name_off);
			if (!*mname) {
				pr_warn("anon member in struct %s is not supported\n",
					st_ops->name);
				break;
			}

			if (__btf_member_bitfield_size(t, member)) {
				pr_warn("bit field member %s in struct %s is not supported\n",
					mname, st_ops->name);
				break;
			}

			func_proto = btf_type_resolve_func_ptr(btf,
							       member->type,
							       NULL);
			if (func_proto &&
			    btf_distill_func_proto(log, btf,
						   func_proto, mname,
						   &st_ops->func_models[j])) {
				pr_warn("Error in parsing func ptr %s in struct %s\n",
					mname, st_ops->name);
				break;
			}
		}

		if (j == btf_type_vlen(t)) {
			if (st_ops->init(btf)) {
				pr_warn("Error in init bpf_struct_ops %s\n",
					st_ops->name);
			} else {
				st_ops->type_id = type_id;
				st_ops->type = t;
				st_ops->value_id = value_id;
				st_ops->value_type = btf_type_by_id(btf,
								    value_id);
			}
		}
	}
}
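/* For illustration only (a sketch, not part of this file): each subsystem
 * supplies a "struct bpf_struct_ops bpf_<name>" definition that the loop in
 * bpf_struct_ops_init() above resolves against btf_vmlinux.  A hypothetical
 * tcp_congestion_ops provider would look roughly like:
 *
 *	struct bpf_struct_ops bpf_tcp_congestion_ops = {
 *		.verifier_ops	= &bpf_tcp_ca_verifier_ops,
 *		.init		= bpf_tcp_ca_init,
 *		.init_member	= bpf_tcp_ca_init_member,
 *		.reg		= bpf_tcp_ca_reg,
 *		.unreg		= bpf_tcp_ca_unreg,
 *		.name		= "tcp_congestion_ops",
 *	};
 *
 * The callback names above are illustrative; the authoritative definition
 * lives with the subsystem (e.g. net/ipv4/bpf_tcp_ca.c).
 */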
extern struct btf *btf_vmlinux;

static const struct bpf_struct_ops *
bpf_struct_ops_find_value(u32 value_id)
{
	unsigned int i;

	if (!value_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->value_id == value_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
	unsigned int i;

	if (!type_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->type_id == type_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}

int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;
	s64 refcnt;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed.  state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->state = state;

	/* This value offers the user space a general estimate of how
	 * many sockets are still utilizing this struct_ops for TCP
	 * congestion control.  The number might not be exact, but it
	 * should sufficiently meet our present goals.
	 */
	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));

	return 0;
}

static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	const struct btf_type *t = st_map->st_ops->type;
	u32 i;

	for (i = 0; i < btf_type_vlen(t); i++) {
		if (st_map->links[i]) {
			bpf_link_put(st_map->links[i]);
			st_map->links[i] = NULL;
		}
	}
}

static int check_zero_holes(const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = __btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf_vmlinux, member->type);
		mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}

static void bpf_struct_ops_link_release(struct bpf_link *link)
{
}

static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
{
	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);

	kfree(tlink);
}

const struct bpf_link_ops bpf_struct_ops_link_lops = {
	.release = bpf_struct_ops_link_release,
	.dealloc = bpf_struct_ops_link_dealloc,
};

int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
				      struct bpf_tramp_link *link,
				      const struct btf_func_model *model,
				      void *stub_func, void *image, void *image_end)
{
	u32 flags = BPF_TRAMP_F_INDIRECT;
	int size;

	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;

	if (model->ret_size > 0)
		flags |= BPF_TRAMP_F_RET_FENTRY_RET;

	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
	if (size < 0)
		return size;
	if (size > (unsigned long)image_end - (unsigned long)image)
		return -E2BIG;
	return arch_prepare_bpf_trampoline(NULL, image, image_end,
					   model, flags, tlinks, stub_func);
}
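/* A rough sketch (for illustration only) of what the update path below does
 * for every function-pointer member that userspace filled with a prog fd:
 *
 *	prog = bpf_prog_get(fd);		// must be BPF_PROG_TYPE_STRUCT_OPS
 *	link = kzalloc'ed bpf_tramp_link wrapping prog;
 *	err  = bpf_struct_ops_prepare_trampoline(tlinks, link,
 *						 &st_ops->func_models[i],
 *						 stub, image, image_end);
 *	*(void **)(kdata + moff) = image + cfi_get_offset();	// native entry the subsystem calls
 *	*(unsigned long *)(udata + moff) = prog->aux->id;	// prog id reported back to userspace
 *
 * i.e. the kernel-facing copy (kvalue) ends up holding trampoline addresses
 * while the user-facing copy (uvalue) holds prog ids.
 */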
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_member *member;
	const struct btf_type *t = st_ops->type;
	struct bpf_tramp_links *tlinks;
	void *udata, *kdata;
	int prog_fd, err;
	void *image, *image_end;
	u32 i;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_ops->value_type, value);
	if (err)
		return err;

	uvalue = value;
	err = check_zero_holes(t, uvalue->data);
	if (err)
		return err;

	if (uvalue->state || refcount_read(&uvalue->refcnt))
		return -EINVAL;

	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
	if (!tlinks)
		return -ENOMEM;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;
	image = st_map->image;
	image_end = st_map->image + PAGE_SIZE;

	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		struct bpf_tramp_link *link;
		u32 moff;

		moff = __btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

		/* If st_ops->init_member does not handle it,
		 * we will only handle func ptrs and zero-ed members
		 * here.  Reject everything else.
		 */

		/* All non func ptr members must be 0 */
		if (!ptype || !btf_type_is_func_proto(ptype)) {
			u32 msize;

			mtype = btf_type_by_id(btf_vmlinux, member->type);
			mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
			if (IS_ERR(mtype)) {
				err = PTR_ERR(mtype);
				goto reset_unlock;
			}

			if (memchr_inv(udata + moff, 0, msize)) {
				err = -EINVAL;
				goto reset_unlock;
			}

			continue;
		}

		prog_fd = (int)(*(unsigned long *)(udata + moff));
		/* Similar check as the attr->attach_prog_fd */
		if (!prog_fd)
			continue;

		prog = bpf_prog_get(prog_fd);
		if (IS_ERR(prog)) {
			err = PTR_ERR(prog);
			goto reset_unlock;
		}

		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
		    prog->aux->attach_btf_id != st_ops->type_id ||
		    prog->expected_attach_type != i) {
			bpf_prog_put(prog);
			err = -EINVAL;
			goto reset_unlock;
		}

		link = kzalloc(sizeof(*link), GFP_USER);
		if (!link) {
			bpf_prog_put(prog);
			err = -ENOMEM;
			goto reset_unlock;
		}
		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
			      &bpf_struct_ops_link_lops, prog);
		st_map->links[i] = &link->link;

		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
							&st_ops->func_models[i],
							*(void **)(st_ops->cfi_stubs + moff),
							image, image_end);
		if (err < 0)
			goto reset_unlock;

		*(void **)(kdata + moff) = image + cfi_get_offset();
		image += err;

		/* put prog_id to udata */
		*(unsigned long *)(udata + moff) = prog->aux->id;
	}

	if (st_map->map.map_flags & BPF_F_LINK) {
		err = 0;
		if (st_ops->validate) {
			err = st_ops->validate(kdata);
			if (err)
				goto reset_unlock;
		}
		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
		/* Let bpf_link handle registration & unregistration.
		 *
		 * Pair with smp_load_acquire() during lookup_elem().
		 */
		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
		goto unlock;
	}

	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
	err = st_ops->reg(kdata);
	if (likely(!err)) {
		/* This refcnt increment on the map here after
		 * 'st_ops->reg()' is safe because the state of the
		 * map must still be INIT at this moment, and thus
		 * bpf_struct_ops_map_delete_elem() can't unregister
		 * or transition it to TOBEFREE concurrently.
		 */
		bpf_map_inc(map);
		/* Pair with smp_load_acquire() during lookup_elem().
		 * It ensures the above udata updates (e.g. prog->aux->id)
		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
		 */
		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
		goto unlock;
	}

	/* Error during st_ops->reg().  Can happen if this struct_ops needs to be
	 * verified as a whole, after all init_member() calls.  Can also happen if
	 * there was a race in registering the struct_ops (under the same name) to
	 * a sub-system through different struct_ops's maps.
	 */
	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);

reset_unlock:
	bpf_struct_ops_map_put_progs(st_map);
	memset(uvalue, 0, map->value_size);
	memset(kvalue, 0, map->value_size);
unlock:
	kfree(tlinks);
	mutex_unlock(&st_map->lock);
	return err;
}

static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
	enum bpf_struct_ops_state prev_state;
	struct bpf_struct_ops_map *st_map;

	st_map = (struct bpf_struct_ops_map *)map;
	if (st_map->map.map_flags & BPF_F_LINK)
		return -EOPNOTSUPP;

	prev_state = cmpxchg(&st_map->kvalue.state,
			     BPF_STRUCT_OPS_STATE_INUSE,
			     BPF_STRUCT_OPS_STATE_TOBEFREE);
	switch (prev_state) {
	case BPF_STRUCT_OPS_STATE_INUSE:
		st_map->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(map);
		return 0;
	case BPF_STRUCT_OPS_STATE_TOBEFREE:
		return -EINPROGRESS;
	case BPF_STRUCT_OPS_STATE_INIT:
		return -ENOENT;
	default:
		WARN_ON_ONCE(1);
		/* Should never happen.  Treat it as not found. */
		return -ENOENT;
	}
}

static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
					     struct seq_file *m)
{
	void *value;
	int err;

	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		return;

	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	if (!err) {
		btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
				  value, m);
		seq_puts(m, "\n");
	}

	kfree(value);
}

static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	if (st_map->links)
		bpf_struct_ops_map_put_progs(st_map);
	bpf_map_area_free(st_map->links);
	if (st_map->image) {
		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
		bpf_jit_uncharge_modmem(PAGE_SIZE);
	}
	bpf_map_area_free(st_map->uvalue);
	bpf_map_area_free(st_map);
}

static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	/* The struct_ops's function may switch to another struct_ops.
	 *
	 * For example, bpf_tcp_cc_x->init() may switch to
	 * another tcp_cc_y by calling
	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
	 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
	 * and its refcount may reach 0 which then frees its
	 * trampoline image while tcp_cc_x is still running.
	 *
	 * A vanilla rcu gp is to wait for all bpf-tcp-cc progs
	 * to finish.  bpf-tcp-cc progs are non sleepable.
	 * A rcu_tasks gp is to wait for the last few insns
	 * in the trampoline image to finish before releasing
	 * the trampoline image.
	 */
	synchronize_rcu_mult(call_rcu, call_rcu_tasks);

	__bpf_struct_ops_map_free(map);
}

static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
		return -EINVAL;
	return 0;
}

static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
	const struct bpf_struct_ops *st_ops;
	size_t st_map_size;
	struct bpf_struct_ops_map *st_map;
	const struct btf_type *t, *vt;
	struct bpf_map *map;
	int ret;

	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
	if (!st_ops)
		return ERR_PTR(-ENOTSUPP);

	vt = st_ops->value_type;
	if (attr->value_size != vt->size)
		return ERR_PTR(-EINVAL);

	t = st_ops->type;

	st_map_size = sizeof(*st_map) +
		/* kvalue stores the
		 * struct bpf_struct_ops_tcp_congestion_ops
		 */
		(vt->size - sizeof(struct bpf_struct_ops_value));

	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
	if (!st_map)
		return ERR_PTR(-ENOMEM);

	st_map->st_ops = st_ops;
	map = &st_map->map;

	ret = bpf_jit_charge_modmem(PAGE_SIZE);
	if (ret) {
		__bpf_struct_ops_map_free(map);
		return ERR_PTR(ret);
	}

	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
	if (!st_map->image) {
		/* __bpf_struct_ops_map_free() uses st_map->image as a flag
		 * for "charged or not".  In this case, we need to uncharge
		 * here.
		 */
		bpf_jit_uncharge_modmem(PAGE_SIZE);
		__bpf_struct_ops_map_free(map);
		return ERR_PTR(-ENOMEM);
	}
	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
	st_map->links =
		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_link *),
				   NUMA_NO_NODE);
	if (!st_map->uvalue || !st_map->links) {
		__bpf_struct_ops_map_free(map);
		return ERR_PTR(-ENOMEM);
	}

	mutex_init(&st_map->lock);
	bpf_map_init_from_attr(map, attr);

	return map;
}

static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	const struct btf_type *vt = st_ops->value_type;
	u64 usage;

	usage = sizeof(*st_map) +
			vt->size - sizeof(struct bpf_struct_ops_value);
	usage += vt->size;
	usage += btf_type_vlen(vt) * sizeof(struct bpf_link *);
	usage += PAGE_SIZE;
	return usage;
}

BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
	.map_alloc_check = bpf_struct_ops_map_alloc_check,
	.map_alloc = bpf_struct_ops_map_alloc,
	.map_free = bpf_struct_ops_map_free,
	.map_get_next_key = bpf_struct_ops_map_get_next_key,
	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
	.map_delete_elem = bpf_struct_ops_map_delete_elem,
	.map_update_elem = bpf_struct_ops_map_update_elem,
	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
	.map_mem_usage = bpf_struct_ops_map_mem_usage,
	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
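/* For illustration only: a subsystem that hands out the kernel struct
 * (kdata) pins the backing map around its use of that struct, roughly:
 *
 *	if (!bpf_struct_ops_get(ca))	// may fail if the map is going away
 *		return -EBUSY;
 *	...				// call into ca->init(), ca->cong_avoid(), ...
 *	bpf_struct_ops_put(ca);		// may drop the last map reference
 *
 * "ca" here is a hypothetical tcp_congestion_ops pointer; the real callers
 * live in the subsystem code, not in this file.
 */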
/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	map = __bpf_map_inc_not_zero(&st_map->map, false);
	return !IS_ERR(map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	bpf_map_put(&st_map->map);
}

static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
	       map->map_flags & BPF_F_LINK &&
	       /* Pair with smp_store_release() during map_update */
	       smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
}

static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_struct_ops_map *st_map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = (struct bpf_struct_ops_map *)
		rcu_dereference_protected(st_link->map, true);
	if (st_map) {
		/* st_link->map can be NULL if
		 * bpf_struct_ops_link_create() fails to register.
		 */
		st_map->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(&st_map->map);
	}
	kfree(st_link);
}

static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
						struct seq_file *seq)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	seq_printf(seq, "map_id:\t%d\n", map->id);
	rcu_read_unlock();
}

static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
						  struct bpf_link_info *info)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	info->struct_ops.map_id = map->id;
	rcu_read_unlock();
	return 0;
}
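/* The entry points below back the BPF_F_LINK flavour of struct_ops maps.
 * For illustration only, the corresponding userspace flow with libbpf looks
 * roughly like this (libbpf helper names, not defined in this file):
 *
 *	map  = bpf_object__find_map_by_name(obj, "my_ops");	// map created with BPF_F_LINK
 *	link = bpf_map__attach_struct_ops(map);			// -> bpf_struct_ops_link_create()
 *	...
 *	err  = bpf_link__update_map(link, new_map);		// -> bpf_struct_ops_map_link_update()
 */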
static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
					  struct bpf_map *expected_old_map)
{
	struct bpf_struct_ops_map *st_map, *old_st_map;
	struct bpf_map *old_map;
	struct bpf_struct_ops_link *st_link;
	int err;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = container_of(new_map, struct bpf_struct_ops_map, map);

	if (!bpf_struct_ops_valid_to_reg(new_map))
		return -EINVAL;

	if (!st_map->st_ops->update)
		return -EOPNOTSUPP;

	mutex_lock(&update_mutex);

	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
	if (expected_old_map && old_map != expected_old_map) {
		err = -EPERM;
		goto err_out;
	}

	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
	/* The new and old struct_ops must be the same type. */
	if (st_map->st_ops != old_st_map->st_ops) {
		err = -EINVAL;
		goto err_out;
	}

	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
	if (err)
		goto err_out;

	bpf_map_inc(new_map);
	rcu_assign_pointer(st_link->map, new_map);
	bpf_map_put(old_map);

err_out:
	mutex_unlock(&update_mutex);

	return err;
}

static const struct bpf_link_ops bpf_struct_ops_map_lops = {
	.dealloc = bpf_struct_ops_map_link_dealloc,
	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
	.update_map = bpf_struct_ops_map_link_update,
};

int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	struct bpf_struct_ops_link *link = NULL;
	struct bpf_link_primer link_primer;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;
	int err;

	map = bpf_map_get(attr->link_create.map_fd);
	if (IS_ERR(map))
		return PTR_ERR(map);

	st_map = (struct bpf_struct_ops_map *)map;

	if (!bpf_struct_ops_valid_to_reg(map)) {
		err = -EINVAL;
		goto err_out;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto err_out;
	}
	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);

	err = bpf_link_prime(&link->link, &link_primer);
	if (err)
		goto err_out;

	err = st_map->st_ops->reg(st_map->kvalue.data);
	if (err) {
		bpf_link_cleanup(&link_primer);
		link = NULL;
		goto err_out;
	}
	RCU_INIT_POINTER(link->map, map);

	return bpf_link_settle(&link_primer);

err_out:
	bpf_map_put(map);
	kfree(link);
	return err;
}