// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */

#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>

enum bpf_struct_ops_state {
	BPF_STRUCT_OPS_STATE_INIT,
	BPF_STRUCT_OPS_STATE_INUSE,
	BPF_STRUCT_OPS_STATE_TOBEFREE,
	BPF_STRUCT_OPS_STATE_READY,
};

#define BPF_STRUCT_OPS_COMMON_VALUE			\
	refcount_t refcnt;				\
	enum bpf_struct_ops_state state

struct bpf_struct_ops_value {
	BPF_STRUCT_OPS_COMMON_VALUE;
	char data[] ____cacheline_aligned_in_smp;
};

struct bpf_struct_ops_map {
	struct bpf_map map;
	struct rcu_head rcu;
	const struct bpf_struct_ops *st_ops;
	/* protect map_update */
	struct mutex lock;
	/* links[] has all the bpf_links that are populated
	 * to the func ptrs of the kernel's struct
	 * (in kvalue.data).
	 */
	struct bpf_link **links;
	/* image is a page that has all the trampolines
	 * that store the func args before calling the bpf_prog.
	 * A PAGE_SIZE "image" is enough to store all trampolines
	 * for "links[]".
	 */
	void *image;
	/* uvalue->data stores the kernel struct
	 * (e.g. tcp_congestion_ops) that is more useful
	 * to userspace than the kvalue.  For example,
	 * the bpf_prog's id is stored instead of the kernel
	 * address of a func ptr.
	 */
	struct bpf_struct_ops_value *uvalue;
	/* kvalue.data stores the actual kernel's struct
	 * (e.g. tcp_congestion_ops) that will be
	 * registered to the kernel subsystem.
	 */
	struct bpf_struct_ops_value kvalue;
};

struct bpf_struct_ops_link {
	struct bpf_link link;
	struct bpf_map __rcu *map;
};

static DEFINE_MUTEX(update_mutex);

#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)

/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
 * the map's value exposed to userspace and its btf-type-id is
 * stored at the map->btf_vmlinux_value_type_id.
 */
#define BPF_STRUCT_OPS_TYPE(_name)				\
extern struct bpf_struct_ops bpf_##_name;			\
								\
struct bpf_struct_ops_##_name {					\
	BPF_STRUCT_OPS_COMMON_VALUE;				\
	struct _name data ____cacheline_aligned_in_smp;	\
};
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

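/* As an illustration (assuming tcp_congestion_ops is one of the types
 * listed in bpf_struct_ops_types.h), the expansion above produces
 * roughly:
 *
 *	extern struct bpf_struct_ops bpf_tcp_congestion_ops;
 *
 *	struct bpf_struct_ops_tcp_congestion_ops {
 *		refcount_t refcnt;
 *		enum bpf_struct_ops_state state;
 *		struct tcp_congestion_ops data ____cacheline_aligned_in_smp;
 *	};
 */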

enum {
#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
	__NR_BPF_STRUCT_OPS_TYPE,
};

static struct bpf_struct_ops * const bpf_struct_ops[] = {
#define BPF_STRUCT_OPS_TYPE(_name)				\
	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
};

const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
	.test_run = bpf_struct_ops_test_run,
#endif
};

static const struct btf_type *module_type;

void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
{
	s32 type_id, value_id, module_id;
	const struct btf_member *member;
	struct bpf_struct_ops *st_ops;
	const struct btf_type *t;
	char value_name[128];
	const char *mname;
	u32 i, j;

	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE

	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
	if (module_id < 0) {
		pr_warn("Cannot find struct module in btf_vmlinux\n");
		return;
	}
	module_type = btf_type_by_id(btf, module_id);

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		st_ops = bpf_struct_ops[i];

		if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
		    sizeof(value_name)) {
			pr_warn("struct_ops name %s is too long\n",
				st_ops->name);
			continue;
		}
		sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);

		value_id = btf_find_by_name_kind(btf, value_name,
						 BTF_KIND_STRUCT);
		if (value_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				value_name);
			continue;
		}

		type_id = btf_find_by_name_kind(btf, st_ops->name,
						BTF_KIND_STRUCT);
		if (type_id < 0) {
			pr_warn("Cannot find struct %s in btf_vmlinux\n",
				st_ops->name);
			continue;
		}
		t = btf_type_by_id(btf, type_id);
		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
			pr_warn("Cannot support #%u members in struct %s\n",
				btf_type_vlen(t), st_ops->name);
			continue;
		}

		for_each_member(j, t, member) {
			const struct btf_type *func_proto;

			mname = btf_name_by_offset(btf, member->name_off);
			if (!*mname) {
				pr_warn("anon member in struct %s is not supported\n",
					st_ops->name);
				break;
			}

			if (__btf_member_bitfield_size(t, member)) {
				pr_warn("bit field member %s in struct %s is not supported\n",
					mname, st_ops->name);
				break;
			}

			func_proto = btf_type_resolve_func_ptr(btf,
							       member->type,
							       NULL);
			if (func_proto &&
			    btf_distill_func_proto(log, btf,
						   func_proto, mname,
						   &st_ops->func_models[j])) {
				pr_warn("Error in parsing func ptr %s in struct %s\n",
					mname, st_ops->name);
				break;
			}
		}

		if (j == btf_type_vlen(t)) {
			if (st_ops->init(btf)) {
				pr_warn("Error in init bpf_struct_ops %s\n",
					st_ops->name);
			} else {
				st_ops->type_id = type_id;
				st_ops->type = t;
				st_ops->value_id = value_id;
				st_ops->value_type = btf_type_by_id(btf,
								    value_id);
			}
		}
	}
}

extern struct btf *btf_vmlinux;

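/* The two lookup helpers below walk the bpf_struct_ops[] array:
 * bpf_struct_ops_find_value() matches on the BTF type id of the
 * "bpf_struct_ops_##name" value struct, while bpf_struct_ops_find()
 * matches on the BTF type id of the kernel struct itself.  Both ids
 * were recorded by bpf_struct_ops_init() above.
 */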
static const struct bpf_struct_ops *
bpf_struct_ops_find_value(u32 value_id)
{
	unsigned int i;

	if (!value_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->value_id == value_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
	unsigned int i;

	if (!type_id || !btf_vmlinux)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
		if (bpf_struct_ops[i]->type_id == type_id)
			return bpf_struct_ops[i];
	}

	return NULL;
}

static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
					   void *next_key)
{
	if (key && *(u32 *)key == 0)
		return -ENOENT;

	*(u32 *)next_key = 0;
	return 0;
}

int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	enum bpf_struct_ops_state state;
	s64 refcnt;

	if (unlikely(*(u32 *)key != 0))
		return -ENOENT;

	kvalue = &st_map->kvalue;
	/* Pair with smp_store_release() during map_update */
	state = smp_load_acquire(&kvalue->state);
	if (state == BPF_STRUCT_OPS_STATE_INIT) {
		memset(value, 0, map->value_size);
		return 0;
	}

	/* No lock is needed.  state and refcnt do not need
	 * to be updated together under atomic context.
	 */
	uvalue = value;
	memcpy(uvalue, st_map->uvalue, map->value_size);
	uvalue->state = state;

	/* This value offers the user space a general estimate of how
	 * many sockets are still utilizing this struct_ops for TCP
	 * congestion control.  The number might not be exact, but it
	 * should sufficiently meet our present goals.
	 */
	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));

	return 0;
}

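/* ->map_lookup_elem is not supported for a struct_ops map; user space
 * reads the value through bpf_struct_ops_map_sys_lookup_elem() above
 * instead.
 */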
static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EINVAL);
}

static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
	const struct btf_type *t = st_map->st_ops->type;
	u32 i;

	for (i = 0; i < btf_type_vlen(t); i++) {
		if (st_map->links[i]) {
			bpf_link_put(st_map->links[i]);
			st_map->links[i] = NULL;
		}
	}
}

static int check_zero_holes(const struct btf_type *t, void *data)
{
	const struct btf_member *member;
	u32 i, moff, msize, prev_mend = 0;
	const struct btf_type *mtype;

	for_each_member(i, t, member) {
		moff = __btf_member_bit_offset(t, member) / 8;
		if (moff > prev_mend &&
		    memchr_inv(data + prev_mend, 0, moff - prev_mend))
			return -EINVAL;

		mtype = btf_type_by_id(btf_vmlinux, member->type);
		mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
		if (IS_ERR(mtype))
			return PTR_ERR(mtype);
		prev_mend = moff + msize;
	}

	if (t->size > prev_mend &&
	    memchr_inv(data + prev_mend, 0, t->size - prev_mend))
		return -EINVAL;

	return 0;
}

static void bpf_struct_ops_link_release(struct bpf_link *link)
{
}

static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
{
	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);

	kfree(tlink);
}

const struct bpf_link_ops bpf_struct_ops_link_lops = {
	.release = bpf_struct_ops_link_release,
	.dealloc = bpf_struct_ops_link_dealloc,
};

int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
				      struct bpf_tramp_link *link,
				      const struct btf_func_model *model,
				      void *image, void *image_end)
{
	u32 flags;

	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
	/* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
	 * and it must be used alone.
	 */
	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
	return arch_prepare_bpf_trampoline(NULL, image, image_end,
					   model, flags, tlinks, NULL);
}

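/* A map update is the "load" of a struct_ops: the single element is
 * copied into uvalue and then, member by member, kvalue.data is
 * filled in.  The module owner and the members handled by
 * ->init_member() are set directly, all other non func ptr members
 * must be zero, and each func ptr member carrying a prog fd gets a
 * trampoline carved out of st_map->image.  Finally the struct is
 * either marked READY (BPF_F_LINK maps, registered later through a
 * bpf_link) or registered with the subsystem via ->reg().
 */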
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 flags)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
	const struct bpf_struct_ops *st_ops = st_map->st_ops;
	struct bpf_struct_ops_value *uvalue, *kvalue;
	const struct btf_member *member;
	const struct btf_type *t = st_ops->type;
	struct bpf_tramp_links *tlinks;
	void *udata, *kdata;
	int prog_fd, err;
	void *image, *image_end;
	u32 i;

	if (flags)
		return -EINVAL;

	if (*(u32 *)key != 0)
		return -E2BIG;

	err = check_zero_holes(st_ops->value_type, value);
	if (err)
		return err;

	uvalue = value;
	err = check_zero_holes(t, uvalue->data);
	if (err)
		return err;

	if (uvalue->state || refcount_read(&uvalue->refcnt))
		return -EINVAL;

	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
	if (!tlinks)
		return -ENOMEM;

	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
	kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;

	mutex_lock(&st_map->lock);

	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
		err = -EBUSY;
		goto unlock;
	}

	memcpy(uvalue, value, map->value_size);

	udata = &uvalue->data;
	kdata = &kvalue->data;
	image = st_map->image;
	image_end = st_map->image + PAGE_SIZE;

	for_each_member(i, t, member) {
		const struct btf_type *mtype, *ptype;
		struct bpf_prog *prog;
		struct bpf_tramp_link *link;
		u32 moff;

		moff = __btf_member_bit_offset(t, member) / 8;
		ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
		if (ptype == module_type) {
			if (*(void **)(udata + moff))
				goto reset_unlock;
			*(void **)(kdata + moff) = BPF_MODULE_OWNER;
			continue;
		}

		err = st_ops->init_member(t, member, kdata, udata);
		if (err < 0)
			goto reset_unlock;

		/* The ->init_member() has handled this member */
		if (err > 0)
			continue;

448 */ 449 450 /* All non func ptr member must be 0 */ 451 if (!ptype || !btf_type_is_func_proto(ptype)) { 452 u32 msize; 453 454 mtype = btf_type_by_id(btf_vmlinux, member->type); 455 mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); 456 if (IS_ERR(mtype)) { 457 err = PTR_ERR(mtype); 458 goto reset_unlock; 459 } 460 461 if (memchr_inv(udata + moff, 0, msize)) { 462 err = -EINVAL; 463 goto reset_unlock; 464 } 465 466 continue; 467 } 468 469 prog_fd = (int)(*(unsigned long *)(udata + moff)); 470 /* Similar check as the attr->attach_prog_fd */ 471 if (!prog_fd) 472 continue; 473 474 prog = bpf_prog_get(prog_fd); 475 if (IS_ERR(prog)) { 476 err = PTR_ERR(prog); 477 goto reset_unlock; 478 } 479 480 if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || 481 prog->aux->attach_btf_id != st_ops->type_id || 482 prog->expected_attach_type != i) { 483 bpf_prog_put(prog); 484 err = -EINVAL; 485 goto reset_unlock; 486 } 487 488 link = kzalloc(sizeof(*link), GFP_USER); 489 if (!link) { 490 bpf_prog_put(prog); 491 err = -ENOMEM; 492 goto reset_unlock; 493 } 494 bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, 495 &bpf_struct_ops_link_lops, prog); 496 st_map->links[i] = &link->link; 497 498 err = bpf_struct_ops_prepare_trampoline(tlinks, link, 499 &st_ops->func_models[i], 500 image, image_end); 501 if (err < 0) 502 goto reset_unlock; 503 504 *(void **)(kdata + moff) = image; 505 image += err; 506 507 /* put prog_id to udata */ 508 *(unsigned long *)(udata + moff) = prog->aux->id; 509 } 510 511 if (st_map->map.map_flags & BPF_F_LINK) { 512 err = 0; 513 if (st_ops->validate) { 514 err = st_ops->validate(kdata); 515 if (err) 516 goto reset_unlock; 517 } 518 set_memory_rox((long)st_map->image, 1); 519 /* Let bpf_link handle registration & unregistration. 520 * 521 * Pair with smp_load_acquire() during lookup_elem(). 522 */ 523 smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY); 524 goto unlock; 525 } 526 527 set_memory_rox((long)st_map->image, 1); 528 err = st_ops->reg(kdata); 529 if (likely(!err)) { 530 /* This refcnt increment on the map here after 531 * 'st_ops->reg()' is secure since the state of the 532 * map must be set to INIT at this moment, and thus 533 * bpf_struct_ops_map_delete_elem() can't unregister 534 * or transition it to TOBEFREE concurrently. 535 */ 536 bpf_map_inc(map); 537 /* Pair with smp_load_acquire() during lookup_elem(). 538 * It ensures the above udata updates (e.g. prog->aux->id) 539 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. 540 */ 541 smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); 542 goto unlock; 543 } 544 545 /* Error during st_ops->reg(). Can happen if this struct_ops needs to be 546 * verified as a whole, after all init_member() calls. Can also happen if 547 * there was a race in registering the struct_ops (under the same name) to 548 * a sub-system through different struct_ops's maps. 
549 */ 550 set_memory_nx((long)st_map->image, 1); 551 set_memory_rw((long)st_map->image, 1); 552 553 reset_unlock: 554 bpf_struct_ops_map_put_progs(st_map); 555 memset(uvalue, 0, map->value_size); 556 memset(kvalue, 0, map->value_size); 557 unlock: 558 kfree(tlinks); 559 mutex_unlock(&st_map->lock); 560 return err; 561 } 562 563 static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) 564 { 565 enum bpf_struct_ops_state prev_state; 566 struct bpf_struct_ops_map *st_map; 567 568 st_map = (struct bpf_struct_ops_map *)map; 569 if (st_map->map.map_flags & BPF_F_LINK) 570 return -EOPNOTSUPP; 571 572 prev_state = cmpxchg(&st_map->kvalue.state, 573 BPF_STRUCT_OPS_STATE_INUSE, 574 BPF_STRUCT_OPS_STATE_TOBEFREE); 575 switch (prev_state) { 576 case BPF_STRUCT_OPS_STATE_INUSE: 577 st_map->st_ops->unreg(&st_map->kvalue.data); 578 bpf_map_put(map); 579 return 0; 580 case BPF_STRUCT_OPS_STATE_TOBEFREE: 581 return -EINPROGRESS; 582 case BPF_STRUCT_OPS_STATE_INIT: 583 return -ENOENT; 584 default: 585 WARN_ON_ONCE(1); 586 /* Should never happen. Treat it as not found. */ 587 return -ENOENT; 588 } 589 } 590 591 static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, 592 struct seq_file *m) 593 { 594 void *value; 595 int err; 596 597 value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); 598 if (!value) 599 return; 600 601 err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); 602 if (!err) { 603 btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, 604 value, m); 605 seq_puts(m, "\n"); 606 } 607 608 kfree(value); 609 } 610 611 static void __bpf_struct_ops_map_free(struct bpf_map *map) 612 { 613 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 614 615 if (st_map->links) 616 bpf_struct_ops_map_put_progs(st_map); 617 bpf_map_area_free(st_map->links); 618 if (st_map->image) { 619 bpf_jit_free_exec(st_map->image); 620 bpf_jit_uncharge_modmem(PAGE_SIZE); 621 } 622 bpf_map_area_free(st_map->uvalue); 623 bpf_map_area_free(st_map); 624 } 625 626 static void bpf_struct_ops_map_free(struct bpf_map *map) 627 { 628 /* The struct_ops's function may switch to another struct_ops. 629 * 630 * For example, bpf_tcp_cc_x->init() may switch to 631 * another tcp_cc_y by calling 632 * setsockopt(TCP_CONGESTION, "tcp_cc_y"). 633 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called 634 * and its refcount may reach 0 which then free its 635 * trampoline image while tcp_cc_x is still running. 636 * 637 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog 638 * to finish. bpf-tcp-cc prog is non sleepable. 639 * A rcu_tasks gp is to wait for the last few insn 640 * in the tramopline image to finish before releasing 641 * the trampoline image. 
642 */ 643 synchronize_rcu_mult(call_rcu, call_rcu_tasks); 644 645 __bpf_struct_ops_map_free(map); 646 } 647 648 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) 649 { 650 if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || 651 (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) 652 return -EINVAL; 653 return 0; 654 } 655 656 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) 657 { 658 const struct bpf_struct_ops *st_ops; 659 size_t st_map_size; 660 struct bpf_struct_ops_map *st_map; 661 const struct btf_type *t, *vt; 662 struct bpf_map *map; 663 int ret; 664 665 st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); 666 if (!st_ops) 667 return ERR_PTR(-ENOTSUPP); 668 669 vt = st_ops->value_type; 670 if (attr->value_size != vt->size) 671 return ERR_PTR(-EINVAL); 672 673 t = st_ops->type; 674 675 st_map_size = sizeof(*st_map) + 676 /* kvalue stores the 677 * struct bpf_struct_ops_tcp_congestions_ops 678 */ 679 (vt->size - sizeof(struct bpf_struct_ops_value)); 680 681 st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); 682 if (!st_map) 683 return ERR_PTR(-ENOMEM); 684 685 st_map->st_ops = st_ops; 686 map = &st_map->map; 687 688 ret = bpf_jit_charge_modmem(PAGE_SIZE); 689 if (ret) { 690 __bpf_struct_ops_map_free(map); 691 return ERR_PTR(ret); 692 } 693 694 st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); 695 if (!st_map->image) { 696 /* __bpf_struct_ops_map_free() uses st_map->image as flag 697 * for "charged or not". In this case, we need to unchange 698 * here. 699 */ 700 bpf_jit_uncharge_modmem(PAGE_SIZE); 701 __bpf_struct_ops_map_free(map); 702 return ERR_PTR(-ENOMEM); 703 } 704 st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); 705 st_map->links = 706 bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), 707 NUMA_NO_NODE); 708 if (!st_map->uvalue || !st_map->links) { 709 __bpf_struct_ops_map_free(map); 710 return ERR_PTR(-ENOMEM); 711 } 712 713 mutex_init(&st_map->lock); 714 set_vm_flush_reset_perms(st_map->image); 715 bpf_map_init_from_attr(map, attr); 716 717 return map; 718 } 719 720 static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) 721 { 722 struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; 723 const struct bpf_struct_ops *st_ops = st_map->st_ops; 724 const struct btf_type *vt = st_ops->value_type; 725 u64 usage; 726 727 usage = sizeof(*st_map) + 728 vt->size - sizeof(struct bpf_struct_ops_value); 729 usage += vt->size; 730 usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); 731 usage += PAGE_SIZE; 732 return usage; 733 } 734 735 BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) 736 const struct bpf_map_ops bpf_struct_ops_map_ops = { 737 .map_alloc_check = bpf_struct_ops_map_alloc_check, 738 .map_alloc = bpf_struct_ops_map_alloc, 739 .map_free = bpf_struct_ops_map_free, 740 .map_get_next_key = bpf_struct_ops_map_get_next_key, 741 .map_lookup_elem = bpf_struct_ops_map_lookup_elem, 742 .map_delete_elem = bpf_struct_ops_map_delete_elem, 743 .map_update_elem = bpf_struct_ops_map_update_elem, 744 .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, 745 .map_mem_usage = bpf_struct_ops_map_mem_usage, 746 .map_btf_id = &bpf_struct_ops_map_btf_ids[0], 747 }; 748 749 /* "const void *" because some subsystem is 750 * passing a const (e.g. 
/* "const void *" because some subsystem is
 * passing a const (e.g. const struct tcp_congestion_ops *)
 */
bool bpf_struct_ops_get(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	map = __bpf_map_inc_not_zero(&st_map->map, false);
	return !IS_ERR(map);
}

void bpf_struct_ops_put(const void *kdata)
{
	struct bpf_struct_ops_value *kvalue;
	struct bpf_struct_ops_map *st_map;

	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);

	bpf_map_put(&st_map->map);
}

static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;

	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
	       map->map_flags & BPF_F_LINK &&
	       /* Pair with smp_store_release() during map_update */
	       smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
}

static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_struct_ops_map *st_map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = (struct bpf_struct_ops_map *)
		rcu_dereference_protected(st_link->map, true);
	if (st_map) {
		/* st_link->map can be NULL if
		 * bpf_struct_ops_link_create() fails to register.
		 */
		st_map->st_ops->unreg(&st_map->kvalue.data);
		bpf_map_put(&st_map->map);
	}
	kfree(st_link);
}

static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
						struct seq_file *seq)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	seq_printf(seq, "map_id:\t%d\n", map->id);
	rcu_read_unlock();
}

static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
						  struct bpf_link_info *info)
{
	struct bpf_struct_ops_link *st_link;
	struct bpf_map *map;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	rcu_read_lock();
	map = rcu_dereference(st_link->map);
	info->struct_ops.map_id = map->id;
	rcu_read_unlock();
	return 0;
}

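/* BPF_LINK_UPDATE on a struct_ops link swaps the map behind the link.
 * The new map must be valid to register (BPF_F_LINK and READY), must
 * share the same struct_ops type as the old map, and the subsystem
 * must provide an ->update() callback to transfer the registration.
 */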
static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
					  struct bpf_map *expected_old_map)
{
	struct bpf_struct_ops_map *st_map, *old_st_map;
	struct bpf_map *old_map;
	struct bpf_struct_ops_link *st_link;
	int err;

	st_link = container_of(link, struct bpf_struct_ops_link, link);
	st_map = container_of(new_map, struct bpf_struct_ops_map, map);

	if (!bpf_struct_ops_valid_to_reg(new_map))
		return -EINVAL;

	if (!st_map->st_ops->update)
		return -EOPNOTSUPP;

	mutex_lock(&update_mutex);

	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
	if (expected_old_map && old_map != expected_old_map) {
		err = -EPERM;
		goto err_out;
	}

	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
	/* The new and old struct_ops must be the same type. */
	if (st_map->st_ops != old_st_map->st_ops) {
		err = -EINVAL;
		goto err_out;
	}

	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
	if (err)
		goto err_out;

	bpf_map_inc(new_map);
	rcu_assign_pointer(st_link->map, new_map);
	bpf_map_put(old_map);

err_out:
	mutex_unlock(&update_mutex);

	return err;
}

static const struct bpf_link_ops bpf_struct_ops_map_lops = {
	.dealloc = bpf_struct_ops_map_link_dealloc,
	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
	.update_map = bpf_struct_ops_map_link_update,
};

int bpf_struct_ops_link_create(union bpf_attr *attr)
{
	struct bpf_struct_ops_link *link = NULL;
	struct bpf_link_primer link_primer;
	struct bpf_struct_ops_map *st_map;
	struct bpf_map *map;
	int err;

	map = bpf_map_get(attr->link_create.map_fd);
	if (IS_ERR(map))
		return PTR_ERR(map);

	st_map = (struct bpf_struct_ops_map *)map;

	if (!bpf_struct_ops_valid_to_reg(map)) {
		err = -EINVAL;
		goto err_out;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto err_out;
	}
	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);

	err = bpf_link_prime(&link->link, &link_primer);
	if (err)
		goto err_out;

	err = st_map->st_ops->reg(st_map->kvalue.data);
	if (err) {
		bpf_link_cleanup(&link_primer);
		link = NULL;
		goto err_out;
	}
	RCU_INIT_POINTER(link->map, map);

	return bpf_link_settle(&link_primer);

err_out:
	bpf_map_put(map);
	kfree(link);
	return err;
}