// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>

enum bpf_struct_ops_state {
	BPF_STRUCT_OPS_STATE_INIT,
	BPF_STRUCT_OPS_STATE_INUSE,
	BPF_STRUCT_OPS_STATE_TOBEFREE,
	BPF_STRUCT_OPS_STATE_READY,
};

#define BPF_STRUCT_OPS_COMMON_VALUE			\
	refcount_t refcnt;				\
	enum bpf_struct_ops_state state

struct bpf_struct_ops_value {
	BPF_STRUCT_OPS_COMMON_VALUE;
	char data[] ____cacheline_aligned_in_smp;
};

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops *st_ops;
	/* protect map_update */
	struct mutex lock;
	/* links has all the bpf_links that are populated
	 * to the func ptrs of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_link **links;
	/* image is a page that has all the trampolines
	 * that store the func args before calling the bpf_prog.
	 * A PAGE_SIZE "image" is enough to store all trampolines for
	 * "links[]".
	 */
	void *image;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue.  For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

struct bpf_struct_ops_link {
	struct bpf_link link;
	struct bpf_map __rcu *map;
};

static DEFINE_MUTEX(update_mutex);

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
 * the map's value exposed to the userspace and its btf-type-id is
 * stored at the map->btf_vmlinux_value_type_id.
 *
 */
#define BPF_STRUCT_OPS_TYPE(_name)				\
extern struct bpf_struct_ops bpf_##_name;			\
								\
struct bpf_struct_ops_##_name {					\
	BPF_STRUCT_OPS_COMMON_VALUE;				\
	struct _name data ____cacheline_aligned_in_smp;		\
};
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

enum {
#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
	__NR_BPF_STRUCT_OPS_TYPE,
};

static struct bpf_struct_ops * const bpf_struct_ops[] = {
#define BPF_STRUCT_OPS_TYPE(_name)				\
	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
};

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
	.test_run = bpf_struct_ops_test_run,
#endif
};

static const struct btf_type *module_type;

void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
{
	s32 type_id, value_id, module_id;
	const struct btf_member *member;
	struct bpf_struct_ops *st_ops;
	const struct btf_type *t;
	char value_name[128];
	const char *mname;
	u32 i, j;

	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
	if (module_id < 0) {
		pr_warn("Cannot find struct module in btf_vmlinux\n");
		return;
	}
	module_type = btf_type_by_id(btf, module_id);

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		st_ops = bpf_struct_ops[i];

		if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
		    sizeof(value_name)) {
			pr_warn("struct_ops name %s is too long\n",
				st_ops->name);
			continue;
		}
		sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

		value_id = btf_find_by_name_kind(btf, value_name,
						 BTF_KIND_STRUCT);
		if (value_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				value_name);
			continue;
		}

		type_id = btf_find_by_name_kind(btf, st_ops->name,
						BTF_KIND_STRUCT);
		if (type_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				st_ops->name);
			continue;
		}
		t = btf_type_by_id(btf, type_id);
		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
			pr_warn("Cannot support #%u members in struct %s\n",
				btf_type_vlen(t), st_ops->name);
			continue;
		}

		for_each_member(j, t, member) {
			const struct btf_type *func_proto;

			mname = btf_name_by_offset(btf, member->name_off);
			if (!*mname) {
				pr_warn("anon member in struct %s is not supported\n",
					st_ops->name);
				break;
			}

			if (__btf_member_bitfield_size(t, member)) {
				pr_warn("bit field member %s in struct %s is not supported\n",
					mname, st_ops->name);
				break;
			}

			func_proto = btf_type_resolve_func_ptr(btf,
							       member->type,
							       NULL);
			if (func_proto &&
			    btf_distill_func_proto(log, btf,
						   func_proto, mname,
						   &st_ops->func_models[j])) {
				pr_warn("Error in parsing func ptr %s in struct %s\n",
					mname, st_ops->name);
				break;
			}
		}

		if (j == btf_type_vlen(t)) {
			if (st_ops->init(btf)) {
				pr_warn("Error in init bpf_struct_ops %s\n",
					st_ops->name);
			} else {
				st_ops->type_id = type_id;
				st_ops->type = t;
				st_ops->value_id = value_id;
				st_ops->value_type = btf_type_by_id(btf,
								    value_id);
			}
		}
	}
}
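/* Illustration only: for a subsystem listed in bpf_struct_ops_types.h,
 * e.g. tcp_congestion_ops, the BPF_STRUCT_OPS_TYPE() macro above roughly
 * expands to the following (a sketch, not the preprocessor output verbatim):
 *
 *	extern struct bpf_struct_ops bpf_tcp_congestion_ops;
 *
 *	struct bpf_struct_ops_tcp_congestion_ops {
 *		refcount_t refcnt;
 *		enum bpf_struct_ops_state state;
 *		struct tcp_congestion_ops data ____cacheline_aligned_in_smp;
 *	};
 *
 * This generated struct is the map value type that userspace sees, and
 * bpf_struct_ops_init() above resolves its BTF id into st_ops->value_id.
 */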
extern struct btf *btf_vmlinux;

static const struct bpf_struct_ops *
bpf_struct_ops_find_value(u32 value_id)
{
	unsigned int i;

	if (!value_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->value_id == value_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
	unsigned int i;

	if (!type_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->type_id == type_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}

int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;
	s64 refcnt;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed.  state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->state = state;

	/* This value offers the user space a general estimate of how
	 * many sockets are still utilizing this struct_ops for TCP
	 * congestion control.  The number might not be exact, but it
	 * should sufficiently meet our present goals.
	 */
	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));

	return 0;
}

static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	const struct btf_type *t = st_map->st_ops->type;
	u32 i;

	for (i = 0; i < btf_type_vlen(t); i++) {
		if (st_map->links[i]) {
			bpf_link_put(st_map->links[i]);
			st_map->links[i] = NULL;
		}
	}
}

static int check_zero_holes(const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = __btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf_vmlinux, member->type);
		mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}

static void bpf_struct_ops_link_release(struct bpf_link *link)
{
}

static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
{
	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);

	kfree(tlink);
}

const struct bpf_link_ops bpf_struct_ops_link_lops = {
	.release = bpf_struct_ops_link_release,
	.dealloc = bpf_struct_ops_link_dealloc,
};

int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
				      struct bpf_tramp_link *link,
				      const struct btf_func_model *model,
				      void *image, void *image_end)
{
	u32 flags;

	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
	/* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
	 * and it must be used alone.
	 */
	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
	return arch_prepare_bpf_trampoline(NULL, image, image_end,
					   model, flags, tlinks, NULL);
}
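/* Illustration only (the member and prog names below are hypothetical):
 * after a successful bpf_struct_ops_map_update_elem() below, each func
 * ptr member of kvalue.data points at a trampoline carved out of
 * st_map->image, while uvalue.data keeps the prog id for userspace:
 *
 *	kvalue.data.fn_a = image;		// trampoline calling prog_a
 *	image += err;				// err = trampoline size used
 *	uvalue.data.fn_a = prog_a->aux->id;	// visible via lookup_elem
 */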
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_member *member;
	const struct btf_type *t = st_ops->type;
	struct bpf_tramp_links *tlinks;
	void *udata, *kdata;
	int prog_fd, err;
	void *image, *image_end;
	u32 i;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_ops->value_type, value);
	if (err)
		return err;

	uvalue = value;
	err = check_zero_holes(t, uvalue->data);
	if (err)
		return err;

	if (uvalue->state || refcount_read(&uvalue->refcnt))
		return -EINVAL;

	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
	if (!tlinks)
		return -ENOMEM;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;
	image = st_map->image;
	image_end = st_map->image + PAGE_SIZE;

	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		struct bpf_tramp_link *link;
		u32 moff;

		moff = __btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

		/* If st_ops->init_member does not handle it,
		 * we will only handle func ptrs and zero-ed members
		 * here.  Reject everything else.
		 */

		/* All non func ptr members must be 0 */
		if (!ptype || !btf_type_is_func_proto(ptype)) {
			u32 msize;

			mtype = btf_type_by_id(btf_vmlinux, member->type);
			mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
			if (IS_ERR(mtype)) {
				err = PTR_ERR(mtype);
				goto reset_unlock;
			}

			if (memchr_inv(udata + moff, 0, msize)) {
				err = -EINVAL;
				goto reset_unlock;
			}

			continue;
		}

		prog_fd = (int)(*(unsigned long *)(udata + moff));
		/* Similar check as the attr->attach_prog_fd */
		if (!prog_fd)
			continue;

		prog = bpf_prog_get(prog_fd);
		if (IS_ERR(prog)) {
			err = PTR_ERR(prog);
			goto reset_unlock;
		}

		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
		    prog->aux->attach_btf_id != st_ops->type_id ||
		    prog->expected_attach_type != i) {
			bpf_prog_put(prog);
			err = -EINVAL;
			goto reset_unlock;
		}

		link = kzalloc(sizeof(*link), GFP_USER);
		if (!link) {
			bpf_prog_put(prog);
			err = -ENOMEM;
			goto reset_unlock;
		}
		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
			      &bpf_struct_ops_link_lops, prog);
		st_map->links[i] = &link->link;

		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
							&st_ops->func_models[i],
							image, image_end);
		if (err < 0)
			goto reset_unlock;

		*(void **)(kdata + moff) = image;
		image += err;

		/* put prog_id to udata */
		*(unsigned long *)(udata + moff) = prog->aux->id;
	}

	if (st_map->map.map_flags & BPF_F_LINK) {
		err = 0;
		if (st_ops->validate) {
			err = st_ops->validate(kdata);
			if (err)
				goto reset_unlock;
		}
		set_memory_rox((long)st_map->image, 1);
		/* Let bpf_link handle registration & unregistration.
		 *
		 * Pair with smp_load_acquire() during lookup_elem().
		 */
		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
		goto unlock;
	}

	set_memory_rox((long)st_map->image, 1);
	err = st_ops->reg(kdata);
	if (likely(!err)) {
		/* This refcnt increment on the map here after
		 * 'st_ops->reg()' is secure since the state of the
		 * map must be set to INIT at this moment, and thus
		 * bpf_struct_ops_map_delete_elem() can't unregister
		 * or transition it to TOBEFREE concurrently.
		 */
		bpf_map_inc(map);
		/* Pair with smp_load_acquire() during lookup_elem().
		 * It ensures the above udata updates (e.g. prog->aux->id)
		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
		 */
		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
		goto unlock;
	}

	/* Error during st_ops->reg().  Can happen if this struct_ops needs to be
	 * verified as a whole, after all init_member() calls.  Can also happen if
	 * there was a race in registering the struct_ops (under the same name) to
	 * a sub-system through different struct_ops's maps.
	 */
	set_memory_nx((long)st_map->image, 1);
	set_memory_rw((long)st_map->image, 1);

reset_unlock:
	bpf_struct_ops_map_put_progs(st_map);
	memset(uvalue, 0, map->value_size);
	memset(kvalue, 0, map->value_size);
unlock:
	kfree(tlinks);
	mutex_unlock(&st_map->lock);
	return err;
}

static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
	enum bpf_struct_ops_state prev_state;
	struct bpf_struct_ops_map *st_map;

	st_map = (struct bpf_struct_ops_map *)map;
	if (st_map->map.map_flags & BPF_F_LINK)
		return -EOPNOTSUPP;

	prev_state = cmpxchg(&st_map->kvalue.state,
			     BPF_STRUCT_OPS_STATE_INUSE,
			     BPF_STRUCT_OPS_STATE_TOBEFREE);
	switch (prev_state) {
	case BPF_STRUCT_OPS_STATE_INUSE:
		st_map->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(map);
		return 0;
	case BPF_STRUCT_OPS_STATE_TOBEFREE:
		return -EINPROGRESS;
	case BPF_STRUCT_OPS_STATE_INIT:
		return -ENOENT;
	default:
		WARN_ON_ONCE(1);
		/* Should never happen.  Treat it as not found. */
		return -ENOENT;
	}
}

static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
					     struct seq_file *m)
{
	void *value;
	int err;

	value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
	if (!value)
		return;

	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
	if (!err) {
		btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
				  value, m);
		seq_puts(m, "\n");
	}

	kfree(value);
}

static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	if (st_map->links)
		bpf_struct_ops_map_put_progs(st_map);
	bpf_map_area_free(st_map->links);
	bpf_jit_free_exec(st_map->image);
	bpf_map_area_free(st_map->uvalue);
	bpf_map_area_free(st_map);
}

static void bpf_struct_ops_map_free(struct bpf_map *map)
{
	/* The struct_ops's function may switch to another struct_ops.
	 *
	 * For example, bpf_tcp_cc_x->init() may switch to
	 * another tcp_cc_y by calling
	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
	 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
	 * and its refcount may reach 0 which then frees its
	 * trampoline image while tcp_cc_x is still running.
	 *
	 * A vanilla rcu gp is to wait for all bpf-tcp-cc progs
	 * to finish.  bpf-tcp-cc progs are non-sleepable.
	 * A rcu_tasks gp is to wait for the last few insns
	 * in the trampoline image to finish before releasing
	 * the trampoline image.
	 */
	synchronize_rcu_mult(call_rcu, call_rcu_tasks);

	__bpf_struct_ops_map_free(map);
}

static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
		return -EINVAL;
	return 0;
}

static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
	const struct bpf_struct_ops *st_ops;
	size_t st_map_size;
	struct bpf_struct_ops_map *st_map;
	const struct btf_type *t, *vt;
	struct bpf_map *map;

	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
	if (!st_ops)
		return ERR_PTR(-ENOTSUPP);

	vt = st_ops->value_type;
	if (attr->value_size != vt->size)
		return ERR_PTR(-EINVAL);

	t = st_ops->type;

	st_map_size = sizeof(*st_map) +
		      /* kvalue stores the
		       * struct bpf_struct_ops_tcp_congestion_ops
		       */
		      (vt->size - sizeof(struct bpf_struct_ops_value));

	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
	if (!st_map)
		return ERR_PTR(-ENOMEM);

	st_map->st_ops = st_ops;
	map = &st_map->map;

	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
	st_map->links =
		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_link *),
				   NUMA_NO_NODE);
	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!st_map->uvalue || !st_map->links || !st_map->image) {
		__bpf_struct_ops_map_free(map);
		return ERR_PTR(-ENOMEM);
	}

	mutex_init(&st_map->lock);
	set_vm_flush_reset_perms(st_map->image);
	bpf_map_init_from_attr(map, attr);

	return map;
}

static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	const struct btf_type *vt = st_ops->value_type;
	u64 usage;

	usage = sizeof(*st_map) +
			vt->size - sizeof(struct bpf_struct_ops_value);
	usage += vt->size;
	usage += btf_type_vlen(vt) * sizeof(struct bpf_link *);
	usage += PAGE_SIZE;
	return usage;
}

BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
	.map_alloc_check = bpf_struct_ops_map_alloc_check,
	.map_alloc = bpf_struct_ops_map_alloc,
	.map_free = bpf_struct_ops_map_free,
	.map_get_next_key = bpf_struct_ops_map_get_next_key,
	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
	.map_delete_elem = bpf_struct_ops_map_delete_elem,
	.map_update_elem = bpf_struct_ops_map_update_elem,
	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
	.map_mem_usage = bpf_struct_ops_map_mem_usage,
	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
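/* Illustration only: a typical userspace sequence driving this map type,
 * normally generated by libbpf from a SEC(".struct_ops") definition:
 *
 *	attr.map_type = BPF_MAP_TYPE_STRUCT_OPS;
 *	attr.key_size = 4;			// single element, key is always 0
 *	attr.value_size = sizeof(struct bpf_struct_ops_<name>);
 *	attr.max_entries = 1;
 *	attr.btf_vmlinux_value_type_id = <BTF id of bpf_struct_ops_<name>>;
 *	bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 *	// fill the value (prog fds for func ptrs, scalars), then:
 *	bpf(BPF_MAP_UPDATE_ELEM, ...);		// key 0 -> map_update_elem above
 */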
/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	map = __bpf_map_inc_not_zero(&st_map->map, false);
	return !IS_ERR(map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	bpf_map_put(&st_map->map);
}

static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
		map->map_flags & BPF_F_LINK &&
		/* Pair with smp_store_release() during map_update */
		smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
}

static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_struct_ops_map *st_map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = (struct bpf_struct_ops_map *)
		rcu_dereference_protected(st_link->map, true);
	if (st_map) {
		/* st_link->map can be NULL if
		 * bpf_struct_ops_link_create() fails to register.
		 */
		st_map->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(&st_map->map);
	}
	kfree(st_link);
}

static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
						struct seq_file *seq)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	seq_printf(seq, "map_id:\t%d\n", map->id);
	rcu_read_unlock();
}

static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
						  struct bpf_link_info *info)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	info->struct_ops.map_id = map->id;
	rcu_read_unlock();
	return 0;
}
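/* Sketch of the BPF_LINK_UPDATE flow handled below (illustrative only):
 * the new map must be a BPF_F_LINK struct_ops map already in
 * BPF_STRUCT_OPS_STATE_READY, old and new maps must share the same
 * st_ops, and st_ops->update() swaps the registration under update_mutex.
 * On success the link takes a ref on the new map and drops the old one.
 */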
static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
					  struct bpf_map *expected_old_map)
{
	struct bpf_struct_ops_map *st_map, *old_st_map;
	struct bpf_map *old_map;
	struct bpf_struct_ops_link *st_link;
	int err;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = container_of(new_map, struct bpf_struct_ops_map, map);

	if (!bpf_struct_ops_valid_to_reg(new_map))
		return -EINVAL;

	if (!st_map->st_ops->update)
		return -EOPNOTSUPP;

	mutex_lock(&update_mutex);

	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
	if (expected_old_map && old_map != expected_old_map) {
		err = -EPERM;
		goto err_out;
	}

	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
	/* The new and old struct_ops must be the same type. */
	if (st_map->st_ops != old_st_map->st_ops) {
		err = -EINVAL;
		goto err_out;
	}

	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
	if (err)
		goto err_out;

	bpf_map_inc(new_map);
	rcu_assign_pointer(st_link->map, new_map);
	bpf_map_put(old_map);

err_out:
	mutex_unlock(&update_mutex);

	return err;
}

static const struct bpf_link_ops bpf_struct_ops_map_lops = {
	.dealloc = bpf_struct_ops_map_link_dealloc,
	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
	.update_map = bpf_struct_ops_map_link_update,
};

int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	struct bpf_struct_ops_link *link = NULL;
	struct bpf_link_primer link_primer;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;
	int err;

	map = bpf_map_get(attr->link_create.map_fd);
	if (IS_ERR(map))
		return PTR_ERR(map);

	st_map = (struct bpf_struct_ops_map *)map;

	if (!bpf_struct_ops_valid_to_reg(map)) {
		err = -EINVAL;
		goto err_out;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto err_out;
	}
	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);

	err = bpf_link_prime(&link->link, &link_primer);
	if (err)
		goto err_out;

	err = st_map->st_ops->reg(st_map->kvalue.data);
	if (err) {
		bpf_link_cleanup(&link_primer);
		link = NULL;
		goto err_out;
	}
	RCU_INIT_POINTER(link->map, map);

	return bpf_link_settle(&link_primer);

err_out:
	bpf_map_put(map);
	kfree(link);
	return err;
}
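/* Lifecycle sketch for the BPF_F_LINK path (illustrative only):
 *
 *	1. BPF_MAP_CREATE with BPF_F_LINK		-> STATE_INIT
 *	2. BPF_MAP_UPDATE_ELEM (key 0)			-> STATE_READY
 *	3. BPF_LINK_CREATE with link_create.map_fd	-> st_ops->reg()
 *	4. BPF_LINK_UPDATE with a new map (optional)	-> st_ops->update()
 *	5. last link fd closed				-> st_ops->unreg()
 *
 * Without BPF_F_LINK, BPF_MAP_UPDATE_ELEM itself calls st_ops->reg()
 * (STATE_INUSE) and BPF_MAP_DELETE_ELEM calls st_ops->unreg().
 */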