// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>

#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)

static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
	      struct bpf_local_storage *local_storage)
{
	return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
}

static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
	struct bpf_map *map = &smap->map;

	if (!map->ops->map_local_storage_charge)
		return 0;

	return map->ops->map_local_storage_charge(smap, owner, size);
}

static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
			 u32 size)
{
	struct bpf_map *map = &smap->map;

	if (map->ops->map_local_storage_uncharge)
		map->ops->map_local_storage_uncharge(smap, owner, size);
}

static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map *smap, void *owner)
{
	struct bpf_map *map = &smap->map;

	return map->ops->map_owner_storage_ptr(owner);
}

static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed_lockless(&selem->snode);
}

static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}

static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}

struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
		void *value, bool swap_uptrs, gfp_t gfp_flags)
{
	struct bpf_local_storage_elem *selem;

	if (mem_charge(smap, owner, smap->elem_size))
		return NULL;

	if (smap->use_kmalloc_nolock) {
		selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
					       __GFP_ZERO, NUMA_NO_NODE);
	} else {
		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
					gfp_flags | __GFP_NOWARN);
	}

	if (selem) {
		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
		atomic_set(&selem->state, 0);
		selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;

		if (value) {
			/* No need to call check_and_init_map_value as memory is zero init */
			copy_map_value(&smap->map, SDATA(selem)->data, value);
			if (swap_uptrs)
				bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
		}
		return selem;
	}

	mem_uncharge(smap, owner, smap->elem_size);

	return NULL;
}
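/*
 * Freeing helpers:
 *
 * A storage or selem allocated with bpf_map_kmalloc_nolock()
 * (use_kmalloc_nolock == true) must be freed with kfree_nolock(); the others
 * are freed with kfree()/kfree_rcu(). When reuse_now == false, the object may
 * still be accessed by sleepable bpf programs, so an RCU Tasks Trace grace
 * period is waited for first, followed by a regular RCU grace period unless
 * the former already implies the latter (rcu_trace_implies_rcu_gp()).
 */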
/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	/* If RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree(), else do kfree_rcu().
	 */
	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
	if (rcu_trace_implies_rcu_gp())
		kfree(local_storage);
	else
		kfree_rcu(local_storage, rcu);
}

/* Handle use_kmalloc_nolock == false */
static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
				     bool vanilla_rcu)
{
	if (vanilla_rcu)
		kfree_rcu(local_storage, rcu);
	else
		call_rcu_tasks_trace(&local_storage->rcu,
				     __bpf_local_storage_free_trace_rcu);
}

static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
	kfree_nolock(local_storage);
}

static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	if (rcu_trace_implies_rcu_gp())
		bpf_local_storage_free_rcu(rcu);
	else
		call_rcu(rcu, bpf_local_storage_free_rcu);
}

static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
				   bool reuse_now)
{
	if (!local_storage)
		return;

	if (!local_storage->use_kmalloc_nolock) {
		__bpf_local_storage_free(local_storage, reuse_now);
		return;
	}

	if (reuse_now) {
		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
		return;
	}

	call_rcu_tasks_trace(&local_storage->rcu,
			     bpf_local_storage_free_trace_rcu);
}

/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	if (rcu_trace_implies_rcu_gp())
		kfree(selem);
	else
		kfree_rcu(selem, rcu);
}

/* Handle use_kmalloc_nolock == false */
static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
			     bool vanilla_rcu)
{
	if (vanilla_rcu)
		kfree_rcu(selem, rcu);
	else
		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
}

static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	/* The bpf_local_storage_map_free will wait for rcu_barrier */
	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

	if (smap) {
		migrate_disable();
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
		migrate_enable();
	}
	kfree_nolock(selem);
}

static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
	if (rcu_trace_implies_rcu_gp())
		bpf_selem_free_rcu(rcu);
	else
		call_rcu(rcu, bpf_selem_free_rcu);
}

void bpf_selem_free(struct bpf_local_storage_elem *selem,
		    bool reuse_now)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	if (!selem->use_kmalloc_nolock) {
		/*
		 * No uptr will be unpinned even when reuse_now == false since uptr
		 * is only supported in task local storage, where
		 * smap->use_kmalloc_nolock == true.
		 */
		if (smap)
			bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
		__bpf_selem_free(selem, reuse_now);
		return;
	}

	if (reuse_now) {
		/*
		 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
		 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
		 */
		call_rcu(&selem->rcu, bpf_selem_free_rcu);
		return;
	}

	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
}

static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
	struct bpf_local_storage_elem *selem;
	struct hlist_node *n;

	/* The "_safe" iteration is needed.
	 * The loop is not removing the selem from the list
	 * but bpf_selem_free will use the selem->rcu_head
	 * which is union-ized with the selem->free_node.
	 */
	hlist_for_each_entry_safe(selem, n, list, free_node)
		bpf_selem_free(selem, reuse_now);
}
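/*
 * Common bookkeeping when an selem is unlinked from its local_storage:
 * invalidate the owner's cache slot if it points to this selem, uncharge the
 * memory (including the local_storage itself when this is the last selem) and,
 * when the local_storage is being freed, clear the owner's storage pointer.
 * With pin_owner == true (only used on the bpf_local_storage_map_free() path),
 * the work is done under a reference on owner_refcnt so that a racing
 * bpf_local_storage_destroy() spins until the accounting has completed.
 */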
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
						 struct bpf_local_storage_map *smap,
						 struct bpf_local_storage *local_storage,
						 bool free_local_storage, bool pin_owner)
{
	void *owner = local_storage->owner;
	u32 uncharge = smap->elem_size;

	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

	if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
		return;

	uncharge += free_local_storage ? sizeof(*local_storage) : 0;
	mem_uncharge(smap, local_storage->owner, uncharge);
	local_storage->mem_charge -= uncharge;

	if (free_local_storage) {
		local_storage->owner = NULL;

		/* After this RCU_INIT, owner may be freed and cannot be used */
		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
	}

	if (pin_owner)
		refcount_dec(&local_storage->owner_refcnt);
}

/* local_storage->lock must be held and selem->local_storage == local_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 */
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
					    struct bpf_local_storage_elem *selem,
					    struct hlist_head *free_selem_list)
{
	struct bpf_local_storage_map *smap;
	bool free_local_storage;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	free_local_storage = hlist_is_singular_node(&selem->snode,
						    &local_storage->list);

	bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
					     free_local_storage, false);

	hlist_del_init_rcu(&selem->snode);

	hlist_add_head(&selem->free_node, free_selem_list);

	return free_local_storage;
}

void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
				   struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	local_storage->mem_charge += smap->elem_size;

	RCU_INIT_POINTER(selem->local_storage, local_storage);
	hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
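/*
 * The bucket lock and local_storage->lock are rqspinlocks, so acquiring them
 * can fail (e.g. -ETIMEDOUT). The link/unlink helpers that take a lock
 * therefore return an error for the caller to handle, while the *_nolock
 * variants expect the caller to already hold the relevant lock.
 */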
static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	b = select_bucket(smap, local_storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_del_init_rcu(&selem->map_node);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}

static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
{
	hlist_del_init_rcu(&selem->map_node);
}

int bpf_selem_link_map(struct bpf_local_storage_map *smap,
		       struct bpf_local_storage *local_storage,
		       struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}

static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
				      struct bpf_local_storage_elem *selem)
{
	hlist_add_head_rcu(&selem->map_node, &b->list);
}

/*
 * Unlink an selem from map and local storage with lock held.
 * This is the common path used by local storages to delete an selem.
 */
int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	bool free_local_storage = false;
	HLIST_HEAD(selem_free_list);
	unsigned long flags;
	int err;

	if (unlikely(!selem_linked_to_storage_lockless(selem)))
		/* selem has already been unlinked from sk */
		return 0;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return err;

	if (likely(selem_linked_to_storage(selem))) {
		/* Always unlink from map before unlinking from local_storage
		 * because selem will be freed after successfully unlinked from
		 * the local_storage.
		 */
		err = bpf_selem_unlink_map(selem);
		if (err)
			goto out;

		free_local_storage = bpf_selem_unlink_storage_nolock(
			local_storage, selem, &selem_free_list);
	}
out:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);

	bpf_selem_free_list(&selem_free_list, false);

	if (free_local_storage)
		bpf_local_storage_free(local_storage, false);

	return err;
}
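/*
 * selem->state coordinates the two nofail callers. When an selem cannot be
 * fully unlinked in one call (unlink != 2), bpf_local_storage_map_free() marks
 * it SELEM_MAP_UNLINKED and bpf_local_storage_destroy() marks it
 * SELEM_STORAGE_UNLINKED. The selem is then freed either directly (unlink == 2)
 * or by whichever caller wins the cmpxchg() to SELEM_TOFREE once both sides
 * have seen the selem.
 */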
/*
 * Unlink an selem from map and local storage with lockless fallback if callers
 * are racing or rqspinlock returns error. It should only be called by
 * bpf_local_storage_destroy() or bpf_local_storage_map_free().
 */
static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
				    struct bpf_local_storage_map_bucket *b)
{
	bool in_map_free = !!b, free_storage = false;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	unsigned long flags;
	int err, unlink = 0;

	local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	if (smap) {
		b = b ?: select_bucket(smap, local_storage);
		err = raw_res_spin_lock_irqsave(&b->lock, flags);
		if (!err) {
			/*
			 * Call bpf_obj_free_fields() under b->lock to make sure it is done
			 * exactly once for an selem. Safe to free special fields immediately
			 * as no BPF program should be referencing the selem.
			 */
			if (likely(selem_linked_to_map(selem))) {
				hlist_del_init_rcu(&selem->map_node);
				bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
				unlink++;
			}
			raw_res_spin_unlock_irqrestore(&b->lock, flags);
		}
		/*
		 * Highly unlikely scenario: resource leak
		 *
		 * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
		 * and both selems belong to the same bucket, if destroy(selem2) acquired
		 * b->lock and blocks for too long, neither map_free(selem1) nor
		 * destroy(selem1) will be able to free the special field associated
		 * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
		 */
		WARN_ON_ONCE(err && in_map_free);
		if (!err || in_map_free)
			RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
	}

	if (local_storage) {
		err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
		if (!err) {
			if (likely(selem_linked_to_storage(selem))) {
				free_storage = hlist_is_singular_node(&selem->snode,
								      &local_storage->list);
				/*
				 * Okay to skip clearing owner_storage and storage->owner in
				 * destroy() since the owner is going away. No user or bpf
				 * programs should be able to reference it.
				 */
				if (smap && in_map_free)
					bpf_selem_unlink_storage_nolock_misc(
						selem, smap, local_storage,
						free_storage, true);
				hlist_del_init_rcu(&selem->snode);
				unlink++;
			}
			raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
		}
		if (!err || !in_map_free)
			RCU_INIT_POINTER(selem->local_storage, NULL);
	}

	if (unlink != 2)
		atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);

	/*
	 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
	 * then freed after an RCU grace period. However, if destroy() and map_free() are
	 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
	 * the selem only after both map_free() and destroy() see the selem.
	 */
	if (unlink == 2 ||
	    atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
		bpf_selem_free(selem, true);

	if (free_storage)
		bpf_local_storage_free(local_storage, true);
}

void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
				      struct bpf_local_storage_map *smap,
				      struct bpf_local_storage_elem *selem)
{
	unsigned long flags;
	int err;

	/* spinlock is needed to avoid racing with the
	 * parallel delete. Otherwise, publishing an already
	 * deleted sdata to the cache will become a use-after-free
	 * problem in the next bpf_local_storage_lookup().
	 */
	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return;

	if (selem_linked_to_storage(selem))
		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
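/*
 * Update flags, as enforced by check_flags() and bpf_local_storage_update():
 *	BPF_ANY		create the elem or update an existing one
 *	BPF_NOEXIST	create only, -EEXIST if the elem already exists
 *	BPF_EXIST	update only, -ENOENT if the elem does not exist
 *	BPF_F_LOCK	may be OR-ed in to update a value that carries a
 *			bpf_spin_lock under that lock
 */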
static int check_flags(const struct bpf_local_storage_data *old_sdata,
		       u64 map_flags)
{
	if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
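/*
 * Allocate the owner's bpf_local_storage, link first_selem into it and into
 * the map bucket, then publish the storage to the owner with cmpxchg().
 * Returns -EAGAIN if another CPU published a storage for this owner first.
 */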
int bpf_local_storage_alloc(void *owner,
			    struct bpf_local_storage_map *smap,
			    struct bpf_local_storage_elem *first_selem,
			    gfp_t gfp_flags)
{
	struct bpf_local_storage *prev_storage, *storage;
	struct bpf_local_storage **owner_storage_ptr;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	err = mem_charge(smap, owner, sizeof(*storage));
	if (err)
		return err;

	if (smap->use_kmalloc_nolock)
		storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
						 __GFP_ZERO, NUMA_NO_NODE);
	else
		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
					  gfp_flags | __GFP_NOWARN);
	if (!storage) {
		err = -ENOMEM;
		goto uncharge;
	}

	INIT_HLIST_HEAD(&storage->list);
	raw_res_spin_lock_init(&storage->lock);
	storage->owner = owner;
	storage->mem_charge = sizeof(*storage);
	storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
	refcount_set(&storage->owner_refcnt, 1);

	bpf_selem_link_storage_nolock(storage, first_selem);

	b = select_bucket(smap, storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		goto uncharge;

	bpf_selem_link_map_nolock(b, first_selem);

	owner_storage_ptr =
		(struct bpf_local_storage **)owner_storage(smap, owner);
	/* Publish storage to the owner.
	 * Instead of using any lock of the kernel object (i.e. owner),
	 * cmpxchg will work with any kernel object regardless of what
	 * the running context is (bh, irq, etc.).
	 *
	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
	 * is protected by the storage->lock. Hence, when freeing
	 * the owner->storage, the storage->lock must be held before
	 * setting owner->storage ptr to NULL.
	 */
	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
	if (unlikely(prev_storage)) {
		bpf_selem_unlink_map_nolock(first_selem);
		raw_res_spin_unlock_irqrestore(&b->lock, flags);
		err = -EAGAIN;
		goto uncharge;
	}
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;

uncharge:
	bpf_local_storage_free(storage, true);
	mem_uncharge(smap, owner, sizeof(*storage));
	return err;
}
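/*
 * bpf_local_storage_update() below has three paths:
 * 1. No local_storage for the owner yet: allocate an selem plus the
 *    bpf_local_storage and publish both via bpf_local_storage_alloc().
 * 2. BPF_F_LOCK update of an existing elem: copy the value in place under the
 *    value's bpf_spin_lock without touching any list.
 * 3. Otherwise: allocate a new selem, link it to the bucket and the
 *    local_storage, then unlink (and later free) the old selem, all under
 *    local_storage->lock and b->lock.
 */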
/* sk cannot be going away because it is linking a new elem
 * to sk->sk_bpf_storage (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and cause other memory issues
 * during map destruction).
 */
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map_bucket *b;
	HLIST_HEAD(old_selem_free_list);
	unsigned long flags, b_flags;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot both be set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) &&
		     !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
		return ERR_PTR(-EINVAL);

	if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
		return ERR_PTR(-EINVAL);

	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
					      bpf_rcu_lock_held());
	if (!local_storage || hlist_empty(&local_storage->list)) {
		/* Very first elem for the owner */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
		if (err) {
			bpf_selem_free(selem, true);
			mem_uncharge(smap, owner, smap->elem_size);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do inline update
		 * such that it can avoid taking the local_storage->lock
		 * and changing the lists.
		 */
		old_sdata =
			bpf_local_storage_lookup(local_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
			copy_map_value_locked(&smap->map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	/* A lookup has just been done before and concluded a new selem is
	 * needed. The chance of an unnecessary alloc is unlikely.
	 */
	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
	if (!alloc_selem)
		return ERR_PTR(-ENOMEM);

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		goto free_selem;

	/* Recheck local_storage->list under local_storage->lock */
	if (unlikely(hlist_empty(&local_storage->list))) {
		/* A parallel del is happening and local_storage is going
		 * away. It has just been checked before, so very
		 * unlikely. Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock;
	}

	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		copy_map_value_locked(&smap->map, old_sdata->data, value,
				      false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
	if (err)
		goto unlock;

	alloc_selem = NULL;
	/* First, link the new selem to the map */
	bpf_selem_link_map_nolock(b, selem);

	/* Second, link (and publish) the new selem to local_storage */
	bpf_selem_link_storage_nolock(local_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
						&old_selem_free_list);
	}

	raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
free_selem:
	bpf_selem_free_list(&old_selem_free_list, false);
	if (alloc_selem) {
		mem_uncharge(smap, owner, smap->elem_size);
		bpf_selem_free(alloc_selem, true);
	}
	return err ? ERR_PTR(err) : SDATA(selem);
}
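/*
 * Each local storage map gets a slot in the owner's local_storage->cache[] so
 * that a cache hit in bpf_local_storage_lookup() avoids walking
 * local_storage->list. Slots are shared among maps: pick the least used index
 * at map creation time and release it when the map is freed.
 */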
static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
	u64 min_usage = U64_MAX;
	u16 i, res = 0;

	spin_lock(&cache->idx_lock);

	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
		if (cache->idx_usage_counts[i] < min_usage) {
			min_usage = cache->idx_usage_counts[i];
			res = i;

			/* Found a free cache_idx */
			if (!min_usage)
				break;
		}
	}
	cache->idx_usage_counts[res]++;

	spin_unlock(&cache->idx_lock);

	return res;
}

static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
					     u16 idx)
{
	spin_lock(&cache->idx_lock);
	cache->idx_usage_counts[idx]--;
	spin_unlock(&cache->idx_lock);
}
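/*
 * A local storage map must be created with BPF_F_NO_PREALLOC, an int key, a
 * BTF-described value and max_entries == 0. For example, a task storage map
 * would typically be declared in a bpf program as (value type hypothetical):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 *		__uint(map_flags, BPF_F_NO_PREALLOC);
 *		__type(key, int);
 *		__type(value, struct my_value);
 *	} task_store SEC(".maps");
 */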
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
{
	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
	    attr->max_entries ||
	    attr->key_size != sizeof(int) || !attr->value_size ||
	    /* Enforce BTF for userspace sk dumping */
	    !attr->btf_key_type_id || !attr->btf_value_type_id)
		return -EINVAL;

	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
		return -E2BIG;

	return 0;
}

int bpf_local_storage_map_check_btf(const struct bpf_map *map,
				    const struct btf *btf,
				    const struct btf_type *key_type,
				    const struct btf_type *value_type)
{
	if (!btf_type_is_i32(key_type))
		return -EINVAL;

	return 0;
}

/*
 * Destroy local storage when the owner is going away. Caller must uncharge memory
 * if memory charging is used.
 */
u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
	struct bpf_local_storage_elem *selem;

	/* Neither the bpf_prog nor the bpf_map's syscall
	 * could be modifying the local_storage->list now.
	 * Thus, no elem can be added to or deleted from the
	 * local_storage->list by the bpf_prog or by the bpf_map's syscall.
	 *
	 * It is racing with bpf_local_storage_map_free() alone
	 * when unlinking elem from the local_storage->list and
	 * the map's bucket->list.
	 */
	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
		bpf_selem_unlink_nofail(selem, NULL);

	if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
		while (refcount_read(&local_storage->owner_refcnt))
			cpu_relax();
		/*
		 * Paired with refcount_dec() in bpf_selem_unlink_nofail()
		 * to make sure destroy() sees the correct local_storage->mem_charge.
		 */
		smp_mb();
	}

	return local_storage->mem_charge;
}

u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
	u64 usage = sizeof(*smap);

	/* The dynamically allocated selems are not counted currently. */
	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
	return usage;
}

struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
			    struct bpf_local_storage_cache *cache,
			    bool use_kmalloc_nolock)
{
	struct bpf_local_storage_map *smap;
	unsigned int i;
	u32 nbuckets;
	int err;

	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;
	}

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_res_spin_lock_init(&smap->buckets[i].lock);
	}

	smap->elem_size = offsetof(struct bpf_local_storage_elem,
				   sdata.data[attr->value_size]);

	/* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
	 * preemptible context. Thus, enforce all storages to use
	 * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
	 */
	smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;

	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
	return &smap->map;

free_smap:
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}

void bpf_local_storage_map_free(struct bpf_map *map,
				struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map_bucket *b;
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;
	unsigned int i;

	smap = (struct bpf_local_storage_map *)map;
	bpf_local_storage_cache_idx_free(cache, smap->cache_idx);

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
	 * RCU read section to finish before proceeding. New RCU
	 * read sections should be prevented via bpf_map_inc_not_zero.
	 */
	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now. No new selem (of this map) can be added
	 * to the owner->storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or when the storage is freed e.g.
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
restart:
		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
			bpf_selem_unlink_nofail(selem, b);

			if (need_resched()) {
				cond_resched_rcu();
				goto restart;
			}
		}
		rcu_read_unlock();
	}

	/* While freeing the storage we may still need to access the map.
	 *
	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above hlist_for_each_entry_rcu() loop
	 * exit immediately.
	 *
	 * However, while freeing the storage one still needs to access the
	 * smap->elem_size to do the uncharging in
	 * bpf_selem_unlink_storage_nolock().
	 *
	 * Hence, wait another rcu grace period for the storage to be freed.
	 */
	synchronize_rcu();

	if (smap->use_kmalloc_nolock) {
		rcu_barrier_tasks_trace();
		rcu_barrier();
	}
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
}