// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior
 * invocation from one of these contexts has completed. sys_bpf() uses the
 * same mechanism by pinning the task to the current CPU and incrementing
 * the recursion protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. These mechanisms require preallocated maps,
 * so there is no need to invoke memory allocations within the lock held
 * sections.
 *
 * BPF maps which need dynamic allocation are only used from (forced)
 * thread context on RT and can therefore use regular spinlocks which in
 * turn allows invoking memory allocations from the lock held section.
 *
 * On a non RT kernel this distinction is neither possible nor required.
78 * spinlock maps to raw_spinlock and the extra code is optimized out by the 79 * compiler. 80 */ 81 struct bucket { 82 struct hlist_nulls_head head; 83 union { 84 raw_spinlock_t raw_lock; 85 spinlock_t lock; 86 }; 87 }; 88 89 #define HASHTAB_MAP_LOCK_COUNT 8 90 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) 91 92 struct bpf_htab { 93 struct bpf_map map; 94 struct bucket *buckets; 95 void *elems; 96 union { 97 struct pcpu_freelist freelist; 98 struct bpf_lru lru; 99 }; 100 struct htab_elem *__percpu *extra_elems; 101 atomic_t count; /* number of elements in this hashtable */ 102 u32 n_buckets; /* number of hash buckets */ 103 u32 elem_size; /* size of each element in bytes */ 104 u32 hashrnd; 105 struct lock_class_key lockdep_key; 106 int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; 107 }; 108 109 /* each htab element is struct htab_elem + key + value */ 110 struct htab_elem { 111 union { 112 struct hlist_nulls_node hash_node; 113 struct { 114 void *padding; 115 union { 116 struct bpf_htab *htab; 117 struct pcpu_freelist_node fnode; 118 struct htab_elem *batch_flink; 119 }; 120 }; 121 }; 122 union { 123 struct rcu_head rcu; 124 struct bpf_lru_node lru_node; 125 }; 126 u32 hash; 127 char key[] __aligned(8); 128 }; 129 130 static inline bool htab_is_prealloc(const struct bpf_htab *htab) 131 { 132 return !(htab->map.map_flags & BPF_F_NO_PREALLOC); 133 } 134 135 static inline bool htab_use_raw_lock(const struct bpf_htab *htab) 136 { 137 return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); 138 } 139 140 static void htab_init_buckets(struct bpf_htab *htab) 141 { 142 unsigned i; 143 144 for (i = 0; i < htab->n_buckets; i++) { 145 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); 146 if (htab_use_raw_lock(htab)) { 147 raw_spin_lock_init(&htab->buckets[i].raw_lock); 148 lockdep_set_class(&htab->buckets[i].raw_lock, 149 &htab->lockdep_key); 150 } else { 151 spin_lock_init(&htab->buckets[i].lock); 152 lockdep_set_class(&htab->buckets[i].lock, 153 &htab->lockdep_key); 154 } 155 } 156 } 157 158 static inline int htab_lock_bucket(const struct bpf_htab *htab, 159 struct bucket *b, u32 hash, 160 unsigned long *pflags) 161 { 162 unsigned long flags; 163 164 hash = hash & HASHTAB_MAP_LOCK_MASK; 165 166 migrate_disable(); 167 if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { 168 __this_cpu_dec(*(htab->map_locked[hash])); 169 migrate_enable(); 170 return -EBUSY; 171 } 172 173 if (htab_use_raw_lock(htab)) 174 raw_spin_lock_irqsave(&b->raw_lock, flags); 175 else 176 spin_lock_irqsave(&b->lock, flags); 177 *pflags = flags; 178 179 return 0; 180 } 181 182 static inline void htab_unlock_bucket(const struct bpf_htab *htab, 183 struct bucket *b, u32 hash, 184 unsigned long flags) 185 { 186 hash = hash & HASHTAB_MAP_LOCK_MASK; 187 if (htab_use_raw_lock(htab)) 188 raw_spin_unlock_irqrestore(&b->raw_lock, flags); 189 else 190 spin_unlock_irqrestore(&b->lock, flags); 191 __this_cpu_dec(*(htab->map_locked[hash])); 192 migrate_enable(); 193 } 194 195 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); 196 197 static bool htab_is_lru(const struct bpf_htab *htab) 198 { 199 return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || 200 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; 201 } 202 203 static bool htab_is_percpu(const struct bpf_htab *htab) 204 { 205 return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || 206 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; 207 } 208 209 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 
key_size, 210 void __percpu *pptr) 211 { 212 *(void __percpu **)(l->key + key_size) = pptr; 213 } 214 215 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) 216 { 217 return *(void __percpu **)(l->key + key_size); 218 } 219 220 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) 221 { 222 return *(void **)(l->key + roundup(map->key_size, 8)); 223 } 224 225 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) 226 { 227 return (struct htab_elem *) (htab->elems + i * htab->elem_size); 228 } 229 230 static void htab_free_elems(struct bpf_htab *htab) 231 { 232 int i; 233 234 if (!htab_is_percpu(htab)) 235 goto free_elems; 236 237 for (i = 0; i < htab->map.max_entries; i++) { 238 void __percpu *pptr; 239 240 pptr = htab_elem_get_ptr(get_htab_elem(htab, i), 241 htab->map.key_size); 242 free_percpu(pptr); 243 cond_resched(); 244 } 245 free_elems: 246 bpf_map_area_free(htab->elems); 247 } 248 249 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock 250 * (bucket_lock). If both locks need to be acquired together, the lock 251 * order is always lru_lock -> bucket_lock and this only happens in 252 * bpf_lru_list.c logic. For example, certain code path of 253 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), 254 * will acquire lru_lock first followed by acquiring bucket_lock. 255 * 256 * In hashtab.c, to avoid deadlock, lock acquisition of 257 * bucket_lock followed by lru_lock is not allowed. In such cases, 258 * bucket_lock needs to be released first before acquiring lru_lock. 259 */ 260 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, 261 u32 hash) 262 { 263 struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); 264 struct htab_elem *l; 265 266 if (node) { 267 l = container_of(node, struct htab_elem, lru_node); 268 memcpy(l->key, key, htab->map.key_size); 269 return l; 270 } 271 272 return NULL; 273 } 274 275 static int prealloc_init(struct bpf_htab *htab) 276 { 277 u32 num_entries = htab->map.max_entries; 278 int err = -ENOMEM, i; 279 280 if (!htab_is_percpu(htab) && !htab_is_lru(htab)) 281 num_entries += num_possible_cpus(); 282 283 htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, 284 htab->map.numa_node); 285 if (!htab->elems) 286 return -ENOMEM; 287 288 if (!htab_is_percpu(htab)) 289 goto skip_percpu_elems; 290 291 for (i = 0; i < num_entries; i++) { 292 u32 size = round_up(htab->map.value_size, 8); 293 void __percpu *pptr; 294 295 pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); 296 if (!pptr) 297 goto free_elems; 298 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, 299 pptr); 300 cond_resched(); 301 } 302 303 skip_percpu_elems: 304 if (htab_is_lru(htab)) 305 err = bpf_lru_init(&htab->lru, 306 htab->map.map_flags & BPF_F_NO_COMMON_LRU, 307 offsetof(struct htab_elem, hash) - 308 offsetof(struct htab_elem, lru_node), 309 htab_lru_map_delete_node, 310 htab); 311 else 312 err = pcpu_freelist_init(&htab->freelist); 313 314 if (err) 315 goto free_elems; 316 317 if (htab_is_lru(htab)) 318 bpf_lru_populate(&htab->lru, htab->elems, 319 offsetof(struct htab_elem, lru_node), 320 htab->elem_size, num_entries); 321 else 322 pcpu_freelist_populate(&htab->freelist, 323 htab->elems + offsetof(struct htab_elem, fnode), 324 htab->elem_size, num_entries); 325 326 return 0; 327 328 free_elems: 329 htab_free_elems(htab); 330 return err; 331 } 332 333 static void prealloc_destroy(struct bpf_htab *htab) 334 { 335 
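	/* Free the preallocated element area first, then the LRU or
	 * freelist bookkeeping that was layered on top of it.
	 */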
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
				  GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each CPU has its own LRU list.
	 * It is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !bpf_capable())
		/* The LRU implementation is much more complicated than other
		 * maps.  Hence, limit to CAP_BPF.
		 */
		return -EPERM;

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each CPU has its own LRU list.
	 * It is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
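	 * With BPF_F_NO_COMMON_LRU each CPU maintains its own LRU list,
	 * trading a map-wide least-recently-used order for less contention
	 * on updates.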
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >= 1 element.
		 * Since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	/* if map size is larger than memlock limit, reject it */
	err = bpf_map_charge_init(&htab->map.memory, cost);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_charge;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int),
							 sizeof(int), GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_int();

	htab_init_buckets(htab);

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
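			 * The extra per-CPU elements allocated below let a
			 * plain prealloc'ed hash map replace an existing entry
			 * without a freelist pop/push.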
526 */ 527 err = alloc_extra_elems(htab); 528 if (err) 529 goto free_prealloc; 530 } 531 } 532 533 return &htab->map; 534 535 free_prealloc: 536 prealloc_destroy(htab); 537 free_map_locked: 538 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 539 free_percpu(htab->map_locked[i]); 540 bpf_map_area_free(htab->buckets); 541 free_charge: 542 bpf_map_charge_finish(&htab->map.memory); 543 free_htab: 544 lockdep_unregister_key(&htab->lockdep_key); 545 kfree(htab); 546 return ERR_PTR(err); 547 } 548 549 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) 550 { 551 return jhash(key, key_len, hashrnd); 552 } 553 554 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) 555 { 556 return &htab->buckets[hash & (htab->n_buckets - 1)]; 557 } 558 559 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) 560 { 561 return &__select_bucket(htab, hash)->head; 562 } 563 564 /* this lookup function can only be called with bucket lock taken */ 565 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, 566 void *key, u32 key_size) 567 { 568 struct hlist_nulls_node *n; 569 struct htab_elem *l; 570 571 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 572 if (l->hash == hash && !memcmp(&l->key, key, key_size)) 573 return l; 574 575 return NULL; 576 } 577 578 /* can be called without bucket lock. it will repeat the loop in 579 * the unlikely event when elements moved from one bucket into another 580 * while link list is being walked 581 */ 582 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, 583 u32 hash, void *key, 584 u32 key_size, u32 n_buckets) 585 { 586 struct hlist_nulls_node *n; 587 struct htab_elem *l; 588 589 again: 590 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 591 if (l->hash == hash && !memcmp(&l->key, key, key_size)) 592 return l; 593 594 if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) 595 goto again; 596 597 return NULL; 598 } 599 600 /* Called from syscall or from eBPF program directly, so 601 * arguments have to match bpf_map_lookup_elem() exactly. 602 * The return value is adjusted by BPF instructions 603 * in htab_map_gen_lookup(). 604 */ 605 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) 606 { 607 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 608 struct hlist_nulls_head *head; 609 struct htab_elem *l; 610 u32 hash, key_size; 611 612 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 613 614 key_size = map->key_size; 615 616 hash = htab_map_hash(key, key_size, htab->hashrnd); 617 618 head = select_bucket(htab, hash); 619 620 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 621 622 return l; 623 } 624 625 static void *htab_map_lookup_elem(struct bpf_map *map, void *key) 626 { 627 struct htab_elem *l = __htab_map_lookup_elem(map, key); 628 629 if (l) 630 return l->key + round_up(map->key_size, 8); 631 632 return NULL; 633 } 634 635 /* inline bpf_map_lookup_elem() call. 
636 * Instead of: 637 * bpf_prog 638 * bpf_map_lookup_elem 639 * map->ops->map_lookup_elem 640 * htab_map_lookup_elem 641 * __htab_map_lookup_elem 642 * do: 643 * bpf_prog 644 * __htab_map_lookup_elem 645 */ 646 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 647 { 648 struct bpf_insn *insn = insn_buf; 649 const int ret = BPF_REG_0; 650 651 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 652 (void *(*)(struct bpf_map *map, void *key))NULL)); 653 *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); 654 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); 655 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 656 offsetof(struct htab_elem, key) + 657 round_up(map->key_size, 8)); 658 return insn - insn_buf; 659 } 660 661 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, 662 void *key, const bool mark) 663 { 664 struct htab_elem *l = __htab_map_lookup_elem(map, key); 665 666 if (l) { 667 if (mark) 668 bpf_lru_node_set_ref(&l->lru_node); 669 return l->key + round_up(map->key_size, 8); 670 } 671 672 return NULL; 673 } 674 675 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 676 { 677 return __htab_lru_map_lookup_elem(map, key, true); 678 } 679 680 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) 681 { 682 return __htab_lru_map_lookup_elem(map, key, false); 683 } 684 685 static int htab_lru_map_gen_lookup(struct bpf_map *map, 686 struct bpf_insn *insn_buf) 687 { 688 struct bpf_insn *insn = insn_buf; 689 const int ret = BPF_REG_0; 690 const int ref_reg = BPF_REG_1; 691 692 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 693 (void *(*)(struct bpf_map *map, void *key))NULL)); 694 *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); 695 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); 696 *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, 697 offsetof(struct htab_elem, lru_node) + 698 offsetof(struct bpf_lru_node, ref)); 699 *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); 700 *insn++ = BPF_ST_MEM(BPF_B, ret, 701 offsetof(struct htab_elem, lru_node) + 702 offsetof(struct bpf_lru_node, ref), 703 1); 704 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 705 offsetof(struct htab_elem, key) + 706 round_up(map->key_size, 8)); 707 return insn - insn_buf; 708 } 709 710 /* It is called from the bpf_lru_list when the LRU needs to delete 711 * older elements from the htab. 
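 * Returns true when the target element was found in its bucket and
 * unlinked, false otherwise (e.g. if the bucket lock could not be taken).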
712 */ 713 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) 714 { 715 struct bpf_htab *htab = (struct bpf_htab *)arg; 716 struct htab_elem *l = NULL, *tgt_l; 717 struct hlist_nulls_head *head; 718 struct hlist_nulls_node *n; 719 unsigned long flags; 720 struct bucket *b; 721 int ret; 722 723 tgt_l = container_of(node, struct htab_elem, lru_node); 724 b = __select_bucket(htab, tgt_l->hash); 725 head = &b->head; 726 727 ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); 728 if (ret) 729 return false; 730 731 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 732 if (l == tgt_l) { 733 hlist_nulls_del_rcu(&l->hash_node); 734 break; 735 } 736 737 htab_unlock_bucket(htab, b, tgt_l->hash, flags); 738 739 return l == tgt_l; 740 } 741 742 /* Called from syscall */ 743 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 744 { 745 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 746 struct hlist_nulls_head *head; 747 struct htab_elem *l, *next_l; 748 u32 hash, key_size; 749 int i = 0; 750 751 WARN_ON_ONCE(!rcu_read_lock_held()); 752 753 key_size = map->key_size; 754 755 if (!key) 756 goto find_first_elem; 757 758 hash = htab_map_hash(key, key_size, htab->hashrnd); 759 760 head = select_bucket(htab, hash); 761 762 /* lookup the key */ 763 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 764 765 if (!l) 766 goto find_first_elem; 767 768 /* key was found, get next key in the same bucket */ 769 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), 770 struct htab_elem, hash_node); 771 772 if (next_l) { 773 /* if next elem in this hash list is non-zero, just return it */ 774 memcpy(next_key, next_l->key, key_size); 775 return 0; 776 } 777 778 /* no more elements in this hash list, go to the next bucket */ 779 i = hash & (htab->n_buckets - 1); 780 i++; 781 782 find_first_elem: 783 /* iterate over buckets */ 784 for (; i < htab->n_buckets; i++) { 785 head = select_bucket(htab, i); 786 787 /* pick first element in the bucket */ 788 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), 789 struct htab_elem, hash_node); 790 if (next_l) { 791 /* if it's not empty, just return it */ 792 memcpy(next_key, next_l->key, key_size); 793 return 0; 794 } 795 } 796 797 /* iterated over all buckets and all elements */ 798 return -ENOENT; 799 } 800 801 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) 802 { 803 if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) 804 free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); 805 kfree(l); 806 } 807 808 static void htab_elem_free_rcu(struct rcu_head *head) 809 { 810 struct htab_elem *l = container_of(head, struct htab_elem, rcu); 811 struct bpf_htab *htab = l->htab; 812 813 htab_elem_free(htab, l); 814 } 815 816 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) 817 { 818 struct bpf_map *map = &htab->map; 819 void *ptr; 820 821 if (map->ops->map_fd_put_ptr) { 822 ptr = fd_htab_map_get_ptr(map, l); 823 map->ops->map_fd_put_ptr(ptr); 824 } 825 } 826 827 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 828 { 829 htab_put_fd_value(htab, l); 830 831 if (htab_is_prealloc(htab)) { 832 __pcpu_freelist_push(&htab->freelist, &l->fnode); 833 } else { 834 atomic_dec(&htab->count); 835 l->htab = htab; 836 call_rcu(&l->rcu, htab_elem_free_rcu); 837 } 838 } 839 840 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, 841 void *value, bool onallcpus) 
842 { 843 if (!onallcpus) { 844 /* copy true value_size bytes */ 845 memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); 846 } else { 847 u32 size = round_up(htab->map.value_size, 8); 848 int off = 0, cpu; 849 850 for_each_possible_cpu(cpu) { 851 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), 852 value + off, size); 853 off += size; 854 } 855 } 856 } 857 858 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) 859 { 860 return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && 861 BITS_PER_LONG == 64; 862 } 863 864 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, 865 void *value, u32 key_size, u32 hash, 866 bool percpu, bool onallcpus, 867 struct htab_elem *old_elem) 868 { 869 u32 size = htab->map.value_size; 870 bool prealloc = htab_is_prealloc(htab); 871 struct htab_elem *l_new, **pl_new; 872 void __percpu *pptr; 873 874 if (prealloc) { 875 if (old_elem) { 876 /* if we're updating the existing element, 877 * use per-cpu extra elems to avoid freelist_pop/push 878 */ 879 pl_new = this_cpu_ptr(htab->extra_elems); 880 l_new = *pl_new; 881 htab_put_fd_value(htab, old_elem); 882 *pl_new = old_elem; 883 } else { 884 struct pcpu_freelist_node *l; 885 886 l = __pcpu_freelist_pop(&htab->freelist); 887 if (!l) 888 return ERR_PTR(-E2BIG); 889 l_new = container_of(l, struct htab_elem, fnode); 890 } 891 } else { 892 if (atomic_inc_return(&htab->count) > htab->map.max_entries) 893 if (!old_elem) { 894 /* when map is full and update() is replacing 895 * old element, it's ok to allocate, since 896 * old element will be freed immediately. 897 * Otherwise return an error 898 */ 899 l_new = ERR_PTR(-E2BIG); 900 goto dec_count; 901 } 902 l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, 903 htab->map.numa_node); 904 if (!l_new) { 905 l_new = ERR_PTR(-ENOMEM); 906 goto dec_count; 907 } 908 check_and_init_map_lock(&htab->map, 909 l_new->key + round_up(key_size, 8)); 910 } 911 912 memcpy(l_new->key, key, key_size); 913 if (percpu) { 914 size = round_up(size, 8); 915 if (prealloc) { 916 pptr = htab_elem_get_ptr(l_new, key_size); 917 } else { 918 /* alloc_percpu zero-fills */ 919 pptr = __alloc_percpu_gfp(size, 8, 920 GFP_ATOMIC | __GFP_NOWARN); 921 if (!pptr) { 922 kfree(l_new); 923 l_new = ERR_PTR(-ENOMEM); 924 goto dec_count; 925 } 926 } 927 928 pcpu_copy_value(htab, pptr, value, onallcpus); 929 930 if (!prealloc) 931 htab_elem_set_ptr(l_new, key_size, pptr); 932 } else if (fd_htab_map_needs_adjust(htab)) { 933 size = round_up(size, 8); 934 memcpy(l_new->key + round_up(key_size, 8), value, size); 935 } else { 936 copy_map_value(&htab->map, 937 l_new->key + round_up(key_size, 8), 938 value); 939 } 940 941 l_new->hash = hash; 942 return l_new; 943 dec_count: 944 atomic_dec(&htab->count); 945 return l_new; 946 } 947 948 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, 949 u64 map_flags) 950 { 951 if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) 952 /* elem already exists */ 953 return -EEXIST; 954 955 if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) 956 /* elem doesn't exist, cannot update it */ 957 return -ENOENT; 958 959 return 0; 960 } 961 962 /* Called from syscall or from eBPF program */ 963 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, 964 u64 map_flags) 965 { 966 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 967 struct htab_elem *l_new = NULL, *l_old; 968 struct hlist_nulls_head *head; 969 unsigned long flags; 970 struct bucket *b; 971 u32 key_size, hash; 972 int ret; 973 
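	/* Only BPF_ANY, BPF_NOEXIST or BPF_EXIST, optionally combined with
	 * BPF_F_LOCK, are accepted here.
	 */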
974 if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) 975 /* unknown flags */ 976 return -EINVAL; 977 978 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 979 980 key_size = map->key_size; 981 982 hash = htab_map_hash(key, key_size, htab->hashrnd); 983 984 b = __select_bucket(htab, hash); 985 head = &b->head; 986 987 if (unlikely(map_flags & BPF_F_LOCK)) { 988 if (unlikely(!map_value_has_spin_lock(map))) 989 return -EINVAL; 990 /* find an element without taking the bucket lock */ 991 l_old = lookup_nulls_elem_raw(head, hash, key, key_size, 992 htab->n_buckets); 993 ret = check_flags(htab, l_old, map_flags); 994 if (ret) 995 return ret; 996 if (l_old) { 997 /* grab the element lock and update value in place */ 998 copy_map_value_locked(map, 999 l_old->key + round_up(key_size, 8), 1000 value, false); 1001 return 0; 1002 } 1003 /* fall through, grab the bucket lock and lookup again. 1004 * 99.9% chance that the element won't be found, 1005 * but second lookup under lock has to be done. 1006 */ 1007 } 1008 1009 ret = htab_lock_bucket(htab, b, hash, &flags); 1010 if (ret) 1011 return ret; 1012 1013 l_old = lookup_elem_raw(head, hash, key, key_size); 1014 1015 ret = check_flags(htab, l_old, map_flags); 1016 if (ret) 1017 goto err; 1018 1019 if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { 1020 /* first lookup without the bucket lock didn't find the element, 1021 * but second lookup with the bucket lock found it. 1022 * This case is highly unlikely, but has to be dealt with: 1023 * grab the element lock in addition to the bucket lock 1024 * and update element in place 1025 */ 1026 copy_map_value_locked(map, 1027 l_old->key + round_up(key_size, 8), 1028 value, false); 1029 ret = 0; 1030 goto err; 1031 } 1032 1033 l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, 1034 l_old); 1035 if (IS_ERR(l_new)) { 1036 /* all pre-allocated elements are in use or memory exhausted */ 1037 ret = PTR_ERR(l_new); 1038 goto err; 1039 } 1040 1041 /* add new element to the head of the list, so that 1042 * concurrent search will find it before old elem 1043 */ 1044 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1045 if (l_old) { 1046 hlist_nulls_del_rcu(&l_old->hash_node); 1047 if (!htab_is_prealloc(htab)) 1048 free_htab_elem(htab, l_old); 1049 } 1050 ret = 0; 1051 err: 1052 htab_unlock_bucket(htab, b, hash, flags); 1053 return ret; 1054 } 1055 1056 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, 1057 u64 map_flags) 1058 { 1059 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1060 struct htab_elem *l_new, *l_old = NULL; 1061 struct hlist_nulls_head *head; 1062 unsigned long flags; 1063 struct bucket *b; 1064 u32 key_size, hash; 1065 int ret; 1066 1067 if (unlikely(map_flags > BPF_EXIST)) 1068 /* unknown flags */ 1069 return -EINVAL; 1070 1071 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1072 1073 key_size = map->key_size; 1074 1075 hash = htab_map_hash(key, key_size, htab->hashrnd); 1076 1077 b = __select_bucket(htab, hash); 1078 head = &b->head; 1079 1080 /* For LRU, we need to alloc before taking bucket's 1081 * spinlock because getting free nodes from LRU may need 1082 * to remove older elements from htab and this removal 1083 * operation will need a bucket lock. 
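	 * If the update fails the new node is returned to the LRU below;
	 * if it replaces an existing element the old node is returned
	 * instead.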
1084 */ 1085 l_new = prealloc_lru_pop(htab, key, hash); 1086 if (!l_new) 1087 return -ENOMEM; 1088 memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); 1089 1090 ret = htab_lock_bucket(htab, b, hash, &flags); 1091 if (ret) 1092 return ret; 1093 1094 l_old = lookup_elem_raw(head, hash, key, key_size); 1095 1096 ret = check_flags(htab, l_old, map_flags); 1097 if (ret) 1098 goto err; 1099 1100 /* add new element to the head of the list, so that 1101 * concurrent search will find it before old elem 1102 */ 1103 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1104 if (l_old) { 1105 bpf_lru_node_set_ref(&l_new->lru_node); 1106 hlist_nulls_del_rcu(&l_old->hash_node); 1107 } 1108 ret = 0; 1109 1110 err: 1111 htab_unlock_bucket(htab, b, hash, flags); 1112 1113 if (ret) 1114 bpf_lru_push_free(&htab->lru, &l_new->lru_node); 1115 else if (l_old) 1116 bpf_lru_push_free(&htab->lru, &l_old->lru_node); 1117 1118 return ret; 1119 } 1120 1121 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, 1122 void *value, u64 map_flags, 1123 bool onallcpus) 1124 { 1125 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1126 struct htab_elem *l_new = NULL, *l_old; 1127 struct hlist_nulls_head *head; 1128 unsigned long flags; 1129 struct bucket *b; 1130 u32 key_size, hash; 1131 int ret; 1132 1133 if (unlikely(map_flags > BPF_EXIST)) 1134 /* unknown flags */ 1135 return -EINVAL; 1136 1137 WARN_ON_ONCE(!rcu_read_lock_held()); 1138 1139 key_size = map->key_size; 1140 1141 hash = htab_map_hash(key, key_size, htab->hashrnd); 1142 1143 b = __select_bucket(htab, hash); 1144 head = &b->head; 1145 1146 ret = htab_lock_bucket(htab, b, hash, &flags); 1147 if (ret) 1148 return ret; 1149 1150 l_old = lookup_elem_raw(head, hash, key, key_size); 1151 1152 ret = check_flags(htab, l_old, map_flags); 1153 if (ret) 1154 goto err; 1155 1156 if (l_old) { 1157 /* per-cpu hash map can update value in-place */ 1158 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), 1159 value, onallcpus); 1160 } else { 1161 l_new = alloc_htab_elem(htab, key, value, key_size, 1162 hash, true, onallcpus, NULL); 1163 if (IS_ERR(l_new)) { 1164 ret = PTR_ERR(l_new); 1165 goto err; 1166 } 1167 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1168 } 1169 ret = 0; 1170 err: 1171 htab_unlock_bucket(htab, b, hash, flags); 1172 return ret; 1173 } 1174 1175 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, 1176 void *value, u64 map_flags, 1177 bool onallcpus) 1178 { 1179 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1180 struct htab_elem *l_new = NULL, *l_old; 1181 struct hlist_nulls_head *head; 1182 unsigned long flags; 1183 struct bucket *b; 1184 u32 key_size, hash; 1185 int ret; 1186 1187 if (unlikely(map_flags > BPF_EXIST)) 1188 /* unknown flags */ 1189 return -EINVAL; 1190 1191 WARN_ON_ONCE(!rcu_read_lock_held()); 1192 1193 key_size = map->key_size; 1194 1195 hash = htab_map_hash(key, key_size, htab->hashrnd); 1196 1197 b = __select_bucket(htab, hash); 1198 head = &b->head; 1199 1200 /* For LRU, we need to alloc before taking bucket's 1201 * spinlock because LRU's elem alloc may need 1202 * to remove older elem from htab and this removal 1203 * operation will need a bucket lock. 
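	 * With BPF_EXIST the update can only modify an element that is
	 * already present, so no new node needs to be pulled from the LRU
	 * up front.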
1204 */ 1205 if (map_flags != BPF_EXIST) { 1206 l_new = prealloc_lru_pop(htab, key, hash); 1207 if (!l_new) 1208 return -ENOMEM; 1209 } 1210 1211 ret = htab_lock_bucket(htab, b, hash, &flags); 1212 if (ret) 1213 return ret; 1214 1215 l_old = lookup_elem_raw(head, hash, key, key_size); 1216 1217 ret = check_flags(htab, l_old, map_flags); 1218 if (ret) 1219 goto err; 1220 1221 if (l_old) { 1222 bpf_lru_node_set_ref(&l_old->lru_node); 1223 1224 /* per-cpu hash map can update value in-place */ 1225 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), 1226 value, onallcpus); 1227 } else { 1228 pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), 1229 value, onallcpus); 1230 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1231 l_new = NULL; 1232 } 1233 ret = 0; 1234 err: 1235 htab_unlock_bucket(htab, b, hash, flags); 1236 if (l_new) 1237 bpf_lru_push_free(&htab->lru, &l_new->lru_node); 1238 return ret; 1239 } 1240 1241 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, 1242 void *value, u64 map_flags) 1243 { 1244 return __htab_percpu_map_update_elem(map, key, value, map_flags, false); 1245 } 1246 1247 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, 1248 void *value, u64 map_flags) 1249 { 1250 return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, 1251 false); 1252 } 1253 1254 /* Called from syscall or from eBPF program */ 1255 static int htab_map_delete_elem(struct bpf_map *map, void *key) 1256 { 1257 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1258 struct hlist_nulls_head *head; 1259 struct bucket *b; 1260 struct htab_elem *l; 1261 unsigned long flags; 1262 u32 hash, key_size; 1263 int ret; 1264 1265 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1266 1267 key_size = map->key_size; 1268 1269 hash = htab_map_hash(key, key_size, htab->hashrnd); 1270 b = __select_bucket(htab, hash); 1271 head = &b->head; 1272 1273 ret = htab_lock_bucket(htab, b, hash, &flags); 1274 if (ret) 1275 return ret; 1276 1277 l = lookup_elem_raw(head, hash, key, key_size); 1278 1279 if (l) { 1280 hlist_nulls_del_rcu(&l->hash_node); 1281 free_htab_elem(htab, l); 1282 } else { 1283 ret = -ENOENT; 1284 } 1285 1286 htab_unlock_bucket(htab, b, hash, flags); 1287 return ret; 1288 } 1289 1290 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) 1291 { 1292 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1293 struct hlist_nulls_head *head; 1294 struct bucket *b; 1295 struct htab_elem *l; 1296 unsigned long flags; 1297 u32 hash, key_size; 1298 int ret; 1299 1300 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1301 1302 key_size = map->key_size; 1303 1304 hash = htab_map_hash(key, key_size, htab->hashrnd); 1305 b = __select_bucket(htab, hash); 1306 head = &b->head; 1307 1308 ret = htab_lock_bucket(htab, b, hash, &flags); 1309 if (ret) 1310 return ret; 1311 1312 l = lookup_elem_raw(head, hash, key, key_size); 1313 1314 if (l) 1315 hlist_nulls_del_rcu(&l->hash_node); 1316 else 1317 ret = -ENOENT; 1318 1319 htab_unlock_bucket(htab, b, hash, flags); 1320 if (l) 1321 bpf_lru_push_free(&htab->lru, &l->lru_node); 1322 return ret; 1323 } 1324 1325 static void delete_all_elements(struct bpf_htab *htab) 1326 { 1327 int i; 1328 1329 for (i = 0; i < htab->n_buckets; i++) { 1330 struct hlist_nulls_head *head = select_bucket(htab, i); 1331 struct hlist_nulls_node *n; 1332 struct htab_elem *l; 1333 1334 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { 1335 
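			/* The map is no longer reachable from programs or
			 * file descriptors (see htab_map_free()), so elements
			 * can be unlinked and freed directly without waiting
			 * for an RCU grace period.
			 */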
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (!htab_is_prealloc(htab))
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	kfree(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
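	 * Start with that as the per-bucket buffer size and grow the buffers
	 * on demand when a larger bucket is encountered.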
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
		if (ret)
			goto next_batch;
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(dst_val + off,
						per_cpu_ptr(pptr, cpu), size);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			check_and_init_map_lock(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 */
			if (is_lru_map) {
				l->batch_flink = node_to_free;
				node_to_free = l;
			} else {
				free_htab_elem(htab, l);
			}
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(htab, b, batch, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	}

next_batch:
	/* If we are not copying data, we can go to next bucket and avoid
	 * unlocking the rcu.
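	 * This also avoids re-enabling and re-disabling instrumentation for
	 * every empty bucket.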
1566 */ 1567 if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { 1568 batch++; 1569 goto again_nocopy; 1570 } 1571 1572 rcu_read_unlock(); 1573 bpf_enable_instrumentation(); 1574 if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, 1575 key_size * bucket_cnt) || 1576 copy_to_user(uvalues + total * value_size, values, 1577 value_size * bucket_cnt))) { 1578 ret = -EFAULT; 1579 goto after_loop; 1580 } 1581 1582 total += bucket_cnt; 1583 batch++; 1584 if (batch >= htab->n_buckets) { 1585 ret = -ENOENT; 1586 goto after_loop; 1587 } 1588 goto again; 1589 1590 after_loop: 1591 if (ret == -EFAULT) 1592 goto out; 1593 1594 /* copy # of entries and next batch */ 1595 ubatch = u64_to_user_ptr(attr->batch.out_batch); 1596 if (copy_to_user(ubatch, &batch, sizeof(batch)) || 1597 put_user(total, &uattr->batch.count)) 1598 ret = -EFAULT; 1599 1600 out: 1601 kvfree(keys); 1602 kvfree(values); 1603 return ret; 1604 } 1605 1606 static int 1607 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1608 union bpf_attr __user *uattr) 1609 { 1610 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1611 false, true); 1612 } 1613 1614 static int 1615 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, 1616 const union bpf_attr *attr, 1617 union bpf_attr __user *uattr) 1618 { 1619 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1620 false, true); 1621 } 1622 1623 static int 1624 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1625 union bpf_attr __user *uattr) 1626 { 1627 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1628 false, false); 1629 } 1630 1631 static int 1632 htab_map_lookup_and_delete_batch(struct bpf_map *map, 1633 const union bpf_attr *attr, 1634 union bpf_attr __user *uattr) 1635 { 1636 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1637 false, false); 1638 } 1639 1640 static int 1641 htab_lru_percpu_map_lookup_batch(struct bpf_map *map, 1642 const union bpf_attr *attr, 1643 union bpf_attr __user *uattr) 1644 { 1645 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1646 true, true); 1647 } 1648 1649 static int 1650 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, 1651 const union bpf_attr *attr, 1652 union bpf_attr __user *uattr) 1653 { 1654 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1655 true, true); 1656 } 1657 1658 static int 1659 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1660 union bpf_attr __user *uattr) 1661 { 1662 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1663 true, false); 1664 } 1665 1666 static int 1667 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, 1668 const union bpf_attr *attr, 1669 union bpf_attr __user *uattr) 1670 { 1671 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1672 true, false); 1673 } 1674 1675 struct bpf_iter_seq_hash_map_info { 1676 struct bpf_map *map; 1677 struct bpf_htab *htab; 1678 void *percpu_value_buf; // non-zero means percpu hash 1679 u32 bucket_id; 1680 u32 skip_elems; 1681 }; 1682 1683 static struct htab_elem * 1684 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, 1685 struct htab_elem *prev_elem) 1686 { 1687 const struct bpf_htab *htab = info->htab; 1688 u32 skip_elems = info->skip_elems; 1689 u32 bucket_id = info->bucket_id; 1690 struct hlist_nulls_head *head; 1691 struct hlist_nulls_node *n; 1692 struct htab_elem *elem; 1693 struct bucket *b; 1694 u32 i, count; 
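	/* bucket_id == n_buckets means the previous walk already consumed
	 * the last bucket.
	 */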
1695 1696 if (bucket_id >= htab->n_buckets) 1697 return NULL; 1698 1699 /* try to find next elem in the same bucket */ 1700 if (prev_elem) { 1701 /* no update/deletion on this bucket, prev_elem should be still valid 1702 * and we won't skip elements. 1703 */ 1704 n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); 1705 elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); 1706 if (elem) 1707 return elem; 1708 1709 /* not found, unlock and go to the next bucket */ 1710 b = &htab->buckets[bucket_id++]; 1711 rcu_read_unlock(); 1712 skip_elems = 0; 1713 } 1714 1715 for (i = bucket_id; i < htab->n_buckets; i++) { 1716 b = &htab->buckets[i]; 1717 rcu_read_lock(); 1718 1719 count = 0; 1720 head = &b->head; 1721 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { 1722 if (count >= skip_elems) { 1723 info->bucket_id = i; 1724 info->skip_elems = count; 1725 return elem; 1726 } 1727 count++; 1728 } 1729 1730 rcu_read_unlock(); 1731 skip_elems = 0; 1732 } 1733 1734 info->bucket_id = i; 1735 info->skip_elems = 0; 1736 return NULL; 1737 } 1738 1739 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) 1740 { 1741 struct bpf_iter_seq_hash_map_info *info = seq->private; 1742 struct htab_elem *elem; 1743 1744 elem = bpf_hash_map_seq_find_next(info, NULL); 1745 if (!elem) 1746 return NULL; 1747 1748 if (*pos == 0) 1749 ++*pos; 1750 return elem; 1751 } 1752 1753 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1754 { 1755 struct bpf_iter_seq_hash_map_info *info = seq->private; 1756 1757 ++*pos; 1758 ++info->skip_elems; 1759 return bpf_hash_map_seq_find_next(info, v); 1760 } 1761 1762 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) 1763 { 1764 struct bpf_iter_seq_hash_map_info *info = seq->private; 1765 u32 roundup_key_size, roundup_value_size; 1766 struct bpf_iter__bpf_map_elem ctx = {}; 1767 struct bpf_map *map = info->map; 1768 struct bpf_iter_meta meta; 1769 int ret = 0, off = 0, cpu; 1770 struct bpf_prog *prog; 1771 void __percpu *pptr; 1772 1773 meta.seq = seq; 1774 prog = bpf_iter_get_info(&meta, elem == NULL); 1775 if (prog) { 1776 ctx.meta = &meta; 1777 ctx.map = info->map; 1778 if (elem) { 1779 roundup_key_size = round_up(map->key_size, 8); 1780 ctx.key = elem->key; 1781 if (!info->percpu_value_buf) { 1782 ctx.value = elem->key + roundup_key_size; 1783 } else { 1784 roundup_value_size = round_up(map->value_size, 8); 1785 pptr = htab_elem_get_ptr(elem, map->key_size); 1786 for_each_possible_cpu(cpu) { 1787 bpf_long_memcpy(info->percpu_value_buf + off, 1788 per_cpu_ptr(pptr, cpu), 1789 roundup_value_size); 1790 off += roundup_value_size; 1791 } 1792 ctx.value = info->percpu_value_buf; 1793 } 1794 } 1795 ret = bpf_iter_run_prog(prog, &ctx); 1796 } 1797 1798 return ret; 1799 } 1800 1801 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) 1802 { 1803 return __bpf_hash_map_seq_show(seq, v); 1804 } 1805 1806 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) 1807 { 1808 if (!v) 1809 (void)__bpf_hash_map_seq_show(seq, NULL); 1810 else 1811 rcu_read_unlock(); 1812 } 1813 1814 static int bpf_iter_init_hash_map(void *priv_data, 1815 struct bpf_iter_aux_info *aux) 1816 { 1817 struct bpf_iter_seq_hash_map_info *seq_info = priv_data; 1818 struct bpf_map *map = aux->map; 1819 void *value_buf; 1820 u32 buf_size; 1821 1822 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 1823 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 1824 buf_size = round_up(map->value_size, 8) * 
num_possible_cpus(); 1825 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); 1826 if (!value_buf) 1827 return -ENOMEM; 1828 1829 seq_info->percpu_value_buf = value_buf; 1830 } 1831 1832 seq_info->map = map; 1833 seq_info->htab = container_of(map, struct bpf_htab, map); 1834 return 0; 1835 } 1836 1837 static void bpf_iter_fini_hash_map(void *priv_data) 1838 { 1839 struct bpf_iter_seq_hash_map_info *seq_info = priv_data; 1840 1841 kfree(seq_info->percpu_value_buf); 1842 } 1843 1844 static const struct seq_operations bpf_hash_map_seq_ops = { 1845 .start = bpf_hash_map_seq_start, 1846 .next = bpf_hash_map_seq_next, 1847 .stop = bpf_hash_map_seq_stop, 1848 .show = bpf_hash_map_seq_show, 1849 }; 1850 1851 static const struct bpf_iter_seq_info iter_seq_info = { 1852 .seq_ops = &bpf_hash_map_seq_ops, 1853 .init_seq_private = bpf_iter_init_hash_map, 1854 .fini_seq_private = bpf_iter_fini_hash_map, 1855 .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), 1856 }; 1857 1858 static int htab_map_btf_id; 1859 const struct bpf_map_ops htab_map_ops = { 1860 .map_meta_equal = bpf_map_meta_equal, 1861 .map_alloc_check = htab_map_alloc_check, 1862 .map_alloc = htab_map_alloc, 1863 .map_free = htab_map_free, 1864 .map_get_next_key = htab_map_get_next_key, 1865 .map_lookup_elem = htab_map_lookup_elem, 1866 .map_update_elem = htab_map_update_elem, 1867 .map_delete_elem = htab_map_delete_elem, 1868 .map_gen_lookup = htab_map_gen_lookup, 1869 .map_seq_show_elem = htab_map_seq_show_elem, 1870 BATCH_OPS(htab), 1871 .map_btf_name = "bpf_htab", 1872 .map_btf_id = &htab_map_btf_id, 1873 .iter_seq_info = &iter_seq_info, 1874 }; 1875 1876 static int htab_lru_map_btf_id; 1877 const struct bpf_map_ops htab_lru_map_ops = { 1878 .map_meta_equal = bpf_map_meta_equal, 1879 .map_alloc_check = htab_map_alloc_check, 1880 .map_alloc = htab_map_alloc, 1881 .map_free = htab_map_free, 1882 .map_get_next_key = htab_map_get_next_key, 1883 .map_lookup_elem = htab_lru_map_lookup_elem, 1884 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, 1885 .map_update_elem = htab_lru_map_update_elem, 1886 .map_delete_elem = htab_lru_map_delete_elem, 1887 .map_gen_lookup = htab_lru_map_gen_lookup, 1888 .map_seq_show_elem = htab_map_seq_show_elem, 1889 BATCH_OPS(htab_lru), 1890 .map_btf_name = "bpf_htab", 1891 .map_btf_id = &htab_lru_map_btf_id, 1892 .iter_seq_info = &iter_seq_info, 1893 }; 1894 1895 /* Called from eBPF program */ 1896 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) 1897 { 1898 struct htab_elem *l = __htab_map_lookup_elem(map, key); 1899 1900 if (l) 1901 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); 1902 else 1903 return NULL; 1904 } 1905 1906 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) 1907 { 1908 struct htab_elem *l = __htab_map_lookup_elem(map, key); 1909 1910 if (l) { 1911 bpf_lru_node_set_ref(&l->lru_node); 1912 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); 1913 } 1914 1915 return NULL; 1916 } 1917 1918 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) 1919 { 1920 struct htab_elem *l; 1921 void __percpu *pptr; 1922 int ret = -ENOENT; 1923 int cpu, off = 0; 1924 u32 size; 1925 1926 /* per_cpu areas are zero-filled and bpf programs can only 1927 * access 'value_size' of them, so copying rounded areas 1928 * will not leak any kernel data 1929 */ 1930 size = round_up(map->value_size, 8); 1931 rcu_read_lock(); 1932 l = __htab_map_lookup_elem(map, key); 1933 if (!l) 1934 goto out; 1935 /* We do not mark LRU map 
element here in order to not mess up 1936 * eviction heuristics when user space does a map walk. 1937 */ 1938 pptr = htab_elem_get_ptr(l, map->key_size); 1939 for_each_possible_cpu(cpu) { 1940 bpf_long_memcpy(value + off, 1941 per_cpu_ptr(pptr, cpu), size); 1942 off += size; 1943 } 1944 ret = 0; 1945 out: 1946 rcu_read_unlock(); 1947 return ret; 1948 } 1949 1950 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, 1951 u64 map_flags) 1952 { 1953 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1954 int ret; 1955 1956 rcu_read_lock(); 1957 if (htab_is_lru(htab)) 1958 ret = __htab_lru_percpu_map_update_elem(map, key, value, 1959 map_flags, true); 1960 else 1961 ret = __htab_percpu_map_update_elem(map, key, value, map_flags, 1962 true); 1963 rcu_read_unlock(); 1964 1965 return ret; 1966 } 1967 1968 static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, 1969 struct seq_file *m) 1970 { 1971 struct htab_elem *l; 1972 void __percpu *pptr; 1973 int cpu; 1974 1975 rcu_read_lock(); 1976 1977 l = __htab_map_lookup_elem(map, key); 1978 if (!l) { 1979 rcu_read_unlock(); 1980 return; 1981 } 1982 1983 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 1984 seq_puts(m, ": {\n"); 1985 pptr = htab_elem_get_ptr(l, map->key_size); 1986 for_each_possible_cpu(cpu) { 1987 seq_printf(m, "\tcpu%d: ", cpu); 1988 btf_type_seq_show(map->btf, map->btf_value_type_id, 1989 per_cpu_ptr(pptr, cpu), m); 1990 seq_puts(m, "\n"); 1991 } 1992 seq_puts(m, "}\n"); 1993 1994 rcu_read_unlock(); 1995 } 1996 1997 static int htab_percpu_map_btf_id; 1998 const struct bpf_map_ops htab_percpu_map_ops = { 1999 .map_meta_equal = bpf_map_meta_equal, 2000 .map_alloc_check = htab_map_alloc_check, 2001 .map_alloc = htab_map_alloc, 2002 .map_free = htab_map_free, 2003 .map_get_next_key = htab_map_get_next_key, 2004 .map_lookup_elem = htab_percpu_map_lookup_elem, 2005 .map_update_elem = htab_percpu_map_update_elem, 2006 .map_delete_elem = htab_map_delete_elem, 2007 .map_seq_show_elem = htab_percpu_map_seq_show_elem, 2008 BATCH_OPS(htab_percpu), 2009 .map_btf_name = "bpf_htab", 2010 .map_btf_id = &htab_percpu_map_btf_id, 2011 .iter_seq_info = &iter_seq_info, 2012 }; 2013 2014 static int htab_lru_percpu_map_btf_id; 2015 const struct bpf_map_ops htab_lru_percpu_map_ops = { 2016 .map_meta_equal = bpf_map_meta_equal, 2017 .map_alloc_check = htab_map_alloc_check, 2018 .map_alloc = htab_map_alloc, 2019 .map_free = htab_map_free, 2020 .map_get_next_key = htab_map_get_next_key, 2021 .map_lookup_elem = htab_lru_percpu_map_lookup_elem, 2022 .map_update_elem = htab_lru_percpu_map_update_elem, 2023 .map_delete_elem = htab_lru_map_delete_elem, 2024 .map_seq_show_elem = htab_percpu_map_seq_show_elem, 2025 BATCH_OPS(htab_lru_percpu), 2026 .map_btf_name = "bpf_htab", 2027 .map_btf_id = &htab_lru_percpu_map_btf_id, 2028 .iter_seq_info = &iter_seq_info, 2029 }; 2030 2031 static int fd_htab_map_alloc_check(union bpf_attr *attr) 2032 { 2033 if (attr->value_size != sizeof(u32)) 2034 return -EINVAL; 2035 return htab_map_alloc_check(attr); 2036 } 2037 2038 static void fd_htab_map_free(struct bpf_map *map) 2039 { 2040 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 2041 struct hlist_nulls_node *n; 2042 struct hlist_nulls_head *head; 2043 struct htab_elem *l; 2044 int i; 2045 2046 for (i = 0; i < htab->n_buckets; i++) { 2047 head = select_bucket(htab, i); 2048 2049 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { 2050 void *ptr = fd_htab_map_get_ptr(map, l); 2051 2052 
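			/* Drop the reference on the inner map that was taken
			 * when this element was installed.
			 */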
map->ops->map_fd_put_ptr(ptr); 2053 } 2054 } 2055 2056 htab_map_free(map); 2057 } 2058 2059 /* only called from syscall */ 2060 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) 2061 { 2062 void **ptr; 2063 int ret = 0; 2064 2065 if (!map->ops->map_fd_sys_lookup_elem) 2066 return -ENOTSUPP; 2067 2068 rcu_read_lock(); 2069 ptr = htab_map_lookup_elem(map, key); 2070 if (ptr) 2071 *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); 2072 else 2073 ret = -ENOENT; 2074 rcu_read_unlock(); 2075 2076 return ret; 2077 } 2078 2079 /* only called from syscall */ 2080 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, 2081 void *key, void *value, u64 map_flags) 2082 { 2083 void *ptr; 2084 int ret; 2085 u32 ufd = *(u32 *)value; 2086 2087 ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); 2088 if (IS_ERR(ptr)) 2089 return PTR_ERR(ptr); 2090 2091 ret = htab_map_update_elem(map, key, &ptr, map_flags); 2092 if (ret) 2093 map->ops->map_fd_put_ptr(ptr); 2094 2095 return ret; 2096 } 2097 2098 static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) 2099 { 2100 struct bpf_map *map, *inner_map_meta; 2101 2102 inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); 2103 if (IS_ERR(inner_map_meta)) 2104 return inner_map_meta; 2105 2106 map = htab_map_alloc(attr); 2107 if (IS_ERR(map)) { 2108 bpf_map_meta_free(inner_map_meta); 2109 return map; 2110 } 2111 2112 map->inner_map_meta = inner_map_meta; 2113 2114 return map; 2115 } 2116 2117 static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) 2118 { 2119 struct bpf_map **inner_map = htab_map_lookup_elem(map, key); 2120 2121 if (!inner_map) 2122 return NULL; 2123 2124 return READ_ONCE(*inner_map); 2125 } 2126 2127 static int htab_of_map_gen_lookup(struct bpf_map *map, 2128 struct bpf_insn *insn_buf) 2129 { 2130 struct bpf_insn *insn = insn_buf; 2131 const int ret = BPF_REG_0; 2132 2133 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 2134 (void *(*)(struct bpf_map *map, void *key))NULL)); 2135 *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); 2136 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); 2137 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 2138 offsetof(struct htab_elem, key) + 2139 round_up(map->key_size, 8)); 2140 *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); 2141 2142 return insn - insn_buf; 2143 } 2144 2145 static void htab_of_map_free(struct bpf_map *map) 2146 { 2147 bpf_map_meta_free(map->inner_map_meta); 2148 fd_htab_map_free(map); 2149 } 2150 2151 static int htab_of_maps_map_btf_id; 2152 const struct bpf_map_ops htab_of_maps_map_ops = { 2153 .map_alloc_check = fd_htab_map_alloc_check, 2154 .map_alloc = htab_of_map_alloc, 2155 .map_free = htab_of_map_free, 2156 .map_get_next_key = htab_map_get_next_key, 2157 .map_lookup_elem = htab_of_map_lookup_elem, 2158 .map_delete_elem = htab_map_delete_elem, 2159 .map_fd_get_ptr = bpf_map_fd_get_ptr, 2160 .map_fd_put_ptr = bpf_map_fd_put_ptr, 2161 .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, 2162 .map_gen_lookup = htab_of_map_gen_lookup, 2163 .map_check_btf = map_check_no_btf, 2164 .map_btf_name = "bpf_htab", 2165 .map_btf_id = &htab_of_maps_map_btf_id, 2166 }; 2167