// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits where perf, kprobes and tracing
 * can be invoked from, the lock operations need to be protected against
 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
 * the lock held section when functions which acquire this lock are invoked
 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
 * variable bpf_prog_active, which prevents BPF programs attached to perf
 * events, kprobes and tracing from being invoked before the prior
 * invocation from one of these contexts has completed. sys_bpf() uses the
 * same mechanism by pinning the task to the current CPU and incrementing
 * the recursion protection across the map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. These mechanisms require preallocated maps,
 * so there is no need to invoke memory allocations within the lock held
 * sections.
 *
 * BPF maps which need dynamic allocation are only used from (forced)
 * thread context on RT and can therefore use regular spinlocks which in
 * turn allows invoking memory allocations from the lock held section.
 *
 * On a non RT kernel this distinction is neither possible nor required.
78 * spinlock maps to raw_spinlock and the extra code is optimized out by the 79 * compiler. 80 */ 81 struct bucket { 82 struct hlist_nulls_head head; 83 union { 84 raw_spinlock_t raw_lock; 85 spinlock_t lock; 86 }; 87 }; 88 89 #define HASHTAB_MAP_LOCK_COUNT 8 90 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) 91 92 struct bpf_htab { 93 struct bpf_map map; 94 struct bucket *buckets; 95 void *elems; 96 union { 97 struct pcpu_freelist freelist; 98 struct bpf_lru lru; 99 }; 100 struct htab_elem *__percpu *extra_elems; 101 atomic_t count; /* number of elements in this hashtable */ 102 u32 n_buckets; /* number of hash buckets */ 103 u32 elem_size; /* size of each element in bytes */ 104 u32 hashrnd; 105 struct lock_class_key lockdep_key; 106 int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; 107 }; 108 109 /* each htab element is struct htab_elem + key + value */ 110 struct htab_elem { 111 union { 112 struct hlist_nulls_node hash_node; 113 struct { 114 void *padding; 115 union { 116 struct bpf_htab *htab; 117 struct pcpu_freelist_node fnode; 118 struct htab_elem *batch_flink; 119 }; 120 }; 121 }; 122 union { 123 struct rcu_head rcu; 124 struct bpf_lru_node lru_node; 125 }; 126 u32 hash; 127 char key[] __aligned(8); 128 }; 129 130 static inline bool htab_is_prealloc(const struct bpf_htab *htab) 131 { 132 return !(htab->map.map_flags & BPF_F_NO_PREALLOC); 133 } 134 135 static inline bool htab_use_raw_lock(const struct bpf_htab *htab) 136 { 137 return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); 138 } 139 140 static void htab_init_buckets(struct bpf_htab *htab) 141 { 142 unsigned i; 143 144 for (i = 0; i < htab->n_buckets; i++) { 145 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); 146 if (htab_use_raw_lock(htab)) { 147 raw_spin_lock_init(&htab->buckets[i].raw_lock); 148 lockdep_set_class(&htab->buckets[i].raw_lock, 149 &htab->lockdep_key); 150 } else { 151 spin_lock_init(&htab->buckets[i].lock); 152 lockdep_set_class(&htab->buckets[i].lock, 153 &htab->lockdep_key); 154 } 155 } 156 } 157 158 static inline int htab_lock_bucket(const struct bpf_htab *htab, 159 struct bucket *b, u32 hash, 160 unsigned long *pflags) 161 { 162 unsigned long flags; 163 164 hash = hash & HASHTAB_MAP_LOCK_MASK; 165 166 migrate_disable(); 167 if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { 168 __this_cpu_dec(*(htab->map_locked[hash])); 169 migrate_enable(); 170 return -EBUSY; 171 } 172 173 if (htab_use_raw_lock(htab)) 174 raw_spin_lock_irqsave(&b->raw_lock, flags); 175 else 176 spin_lock_irqsave(&b->lock, flags); 177 *pflags = flags; 178 179 return 0; 180 } 181 182 static inline void htab_unlock_bucket(const struct bpf_htab *htab, 183 struct bucket *b, u32 hash, 184 unsigned long flags) 185 { 186 hash = hash & HASHTAB_MAP_LOCK_MASK; 187 if (htab_use_raw_lock(htab)) 188 raw_spin_unlock_irqrestore(&b->raw_lock, flags); 189 else 190 spin_unlock_irqrestore(&b->lock, flags); 191 __this_cpu_dec(*(htab->map_locked[hash])); 192 migrate_enable(); 193 } 194 195 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); 196 197 static bool htab_is_lru(const struct bpf_htab *htab) 198 { 199 return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || 200 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; 201 } 202 203 static bool htab_is_percpu(const struct bpf_htab *htab) 204 { 205 return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || 206 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; 207 } 208 209 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 
key_size, 210 void __percpu *pptr) 211 { 212 *(void __percpu **)(l->key + key_size) = pptr; 213 } 214 215 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) 216 { 217 return *(void __percpu **)(l->key + key_size); 218 } 219 220 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) 221 { 222 return *(void **)(l->key + roundup(map->key_size, 8)); 223 } 224 225 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) 226 { 227 return (struct htab_elem *) (htab->elems + i * htab->elem_size); 228 } 229 230 static void htab_free_elems(struct bpf_htab *htab) 231 { 232 int i; 233 234 if (!htab_is_percpu(htab)) 235 goto free_elems; 236 237 for (i = 0; i < htab->map.max_entries; i++) { 238 void __percpu *pptr; 239 240 pptr = htab_elem_get_ptr(get_htab_elem(htab, i), 241 htab->map.key_size); 242 free_percpu(pptr); 243 cond_resched(); 244 } 245 free_elems: 246 bpf_map_area_free(htab->elems); 247 } 248 249 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock 250 * (bucket_lock). If both locks need to be acquired together, the lock 251 * order is always lru_lock -> bucket_lock and this only happens in 252 * bpf_lru_list.c logic. For example, certain code path of 253 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), 254 * will acquire lru_lock first followed by acquiring bucket_lock. 255 * 256 * In hashtab.c, to avoid deadlock, lock acquisition of 257 * bucket_lock followed by lru_lock is not allowed. In such cases, 258 * bucket_lock needs to be released first before acquiring lru_lock. 259 */ 260 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, 261 u32 hash) 262 { 263 struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); 264 struct htab_elem *l; 265 266 if (node) { 267 l = container_of(node, struct htab_elem, lru_node); 268 memcpy(l->key, key, htab->map.key_size); 269 return l; 270 } 271 272 return NULL; 273 } 274 275 static int prealloc_init(struct bpf_htab *htab) 276 { 277 u32 num_entries = htab->map.max_entries; 278 int err = -ENOMEM, i; 279 280 if (!htab_is_percpu(htab) && !htab_is_lru(htab)) 281 num_entries += num_possible_cpus(); 282 283 htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, 284 htab->map.numa_node); 285 if (!htab->elems) 286 return -ENOMEM; 287 288 if (!htab_is_percpu(htab)) 289 goto skip_percpu_elems; 290 291 for (i = 0; i < num_entries; i++) { 292 u32 size = round_up(htab->map.value_size, 8); 293 void __percpu *pptr; 294 295 pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); 296 if (!pptr) 297 goto free_elems; 298 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, 299 pptr); 300 cond_resched(); 301 } 302 303 skip_percpu_elems: 304 if (htab_is_lru(htab)) 305 err = bpf_lru_init(&htab->lru, 306 htab->map.map_flags & BPF_F_NO_COMMON_LRU, 307 offsetof(struct htab_elem, hash) - 308 offsetof(struct htab_elem, lru_node), 309 htab_lru_map_delete_node, 310 htab); 311 else 312 err = pcpu_freelist_init(&htab->freelist); 313 314 if (err) 315 goto free_elems; 316 317 if (htab_is_lru(htab)) 318 bpf_lru_populate(&htab->lru, htab->elems, 319 offsetof(struct htab_elem, lru_node), 320 htab->elem_size, num_entries); 321 else 322 pcpu_freelist_populate(&htab->freelist, 323 htab->elems + offsetof(struct htab_elem, fnode), 324 htab->elem_size, num_entries); 325 326 return 0; 327 328 free_elems: 329 htab_free_elems(htab); 330 return err; 331 } 332 333 static void prealloc_destroy(struct bpf_htab *htab) 334 { 335 
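	/* Called only from htab_map_free() and the htab_map_alloc() error
	 * path, so no map element can be in use at this point.
	 */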
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	struct htab_elem *__percpu *pptr, *l_new;
	struct pcpu_freelist_node *l;
	int cpu;

	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
				  GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		l = pcpu_freelist_pop(&htab->freelist);
		/* pop will succeed, since prealloc_init()
		 * preallocated extra num_possible_cpus elements
		 */
		l_new = container_of(l, struct htab_elem, fnode);
		*per_cpu_ptr(pptr, cpu) = l_new;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static int htab_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !bpf_capable())
		/* The LRU implementation is much more complicated than other
		 * maps. Hence, limit to CAP_BPF.
		 */
		return -EPERM;

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, the user space won't be
		 * able to access the elements via bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
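	 * With BPF_F_NO_COMMON_LRU the preallocated elements are split
	 * evenly across the per-CPU LRU lists, hence max_entries is rounded
	 * to a multiple of num_possible_cpus() below.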
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* ensure each CPU's lru list has >= 1 element.
		 * since we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	err = -E2BIG;
	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	/* if map size is larger than memlock limit, reject it */
	err = bpf_map_charge_init(&htab->map.memory, cost);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket),
					   htab->map.numa_node);
	if (!htab->buckets)
		goto free_charge;

	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
		htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int),
							 sizeof(int), GFP_USER);
		if (!htab->map_locked[i])
			goto free_map_locked;
	}

	if (htab->map.map_flags & BPF_F_ZERO_SEED)
		htab->hashrnd = 0;
	else
		htab->hashrnd = get_random_int();

	htab_init_buckets(htab);

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_map_locked;

		if (!percpu && !lru) {
			/* lru itself can remove the least used element, so
			 * there is no need for an extra elem during map_update.
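			 * The extra per-CPU elements let htab_map_update_elem()
			 * replace an existing entry without a freelist pop/push
			 * while the bucket lock is held.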
526 */ 527 err = alloc_extra_elems(htab); 528 if (err) 529 goto free_prealloc; 530 } 531 } 532 533 return &htab->map; 534 535 free_prealloc: 536 prealloc_destroy(htab); 537 free_map_locked: 538 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 539 free_percpu(htab->map_locked[i]); 540 bpf_map_area_free(htab->buckets); 541 free_charge: 542 bpf_map_charge_finish(&htab->map.memory); 543 free_htab: 544 lockdep_unregister_key(&htab->lockdep_key); 545 kfree(htab); 546 return ERR_PTR(err); 547 } 548 549 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) 550 { 551 return jhash(key, key_len, hashrnd); 552 } 553 554 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) 555 { 556 return &htab->buckets[hash & (htab->n_buckets - 1)]; 557 } 558 559 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) 560 { 561 return &__select_bucket(htab, hash)->head; 562 } 563 564 /* this lookup function can only be called with bucket lock taken */ 565 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, 566 void *key, u32 key_size) 567 { 568 struct hlist_nulls_node *n; 569 struct htab_elem *l; 570 571 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 572 if (l->hash == hash && !memcmp(&l->key, key, key_size)) 573 return l; 574 575 return NULL; 576 } 577 578 /* can be called without bucket lock. it will repeat the loop in 579 * the unlikely event when elements moved from one bucket into another 580 * while link list is being walked 581 */ 582 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, 583 u32 hash, void *key, 584 u32 key_size, u32 n_buckets) 585 { 586 struct hlist_nulls_node *n; 587 struct htab_elem *l; 588 589 again: 590 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 591 if (l->hash == hash && !memcmp(&l->key, key, key_size)) 592 return l; 593 594 if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) 595 goto again; 596 597 return NULL; 598 } 599 600 /* Called from syscall or from eBPF program directly, so 601 * arguments have to match bpf_map_lookup_elem() exactly. 602 * The return value is adjusted by BPF instructions 603 * in htab_map_gen_lookup(). 604 */ 605 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) 606 { 607 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 608 struct hlist_nulls_head *head; 609 struct htab_elem *l; 610 u32 hash, key_size; 611 612 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 613 614 key_size = map->key_size; 615 616 hash = htab_map_hash(key, key_size, htab->hashrnd); 617 618 head = select_bucket(htab, hash); 619 620 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 621 622 return l; 623 } 624 625 static void *htab_map_lookup_elem(struct bpf_map *map, void *key) 626 { 627 struct htab_elem *l = __htab_map_lookup_elem(map, key); 628 629 if (l) 630 return l->key + round_up(map->key_size, 8); 631 632 return NULL; 633 } 634 635 /* inline bpf_map_lookup_elem() call. 
636 * Instead of: 637 * bpf_prog 638 * bpf_map_lookup_elem 639 * map->ops->map_lookup_elem 640 * htab_map_lookup_elem 641 * __htab_map_lookup_elem 642 * do: 643 * bpf_prog 644 * __htab_map_lookup_elem 645 */ 646 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 647 { 648 struct bpf_insn *insn = insn_buf; 649 const int ret = BPF_REG_0; 650 651 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 652 (void *(*)(struct bpf_map *map, void *key))NULL)); 653 *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); 654 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); 655 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 656 offsetof(struct htab_elem, key) + 657 round_up(map->key_size, 8)); 658 return insn - insn_buf; 659 } 660 661 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, 662 void *key, const bool mark) 663 { 664 struct htab_elem *l = __htab_map_lookup_elem(map, key); 665 666 if (l) { 667 if (mark) 668 bpf_lru_node_set_ref(&l->lru_node); 669 return l->key + round_up(map->key_size, 8); 670 } 671 672 return NULL; 673 } 674 675 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 676 { 677 return __htab_lru_map_lookup_elem(map, key, true); 678 } 679 680 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) 681 { 682 return __htab_lru_map_lookup_elem(map, key, false); 683 } 684 685 static int htab_lru_map_gen_lookup(struct bpf_map *map, 686 struct bpf_insn *insn_buf) 687 { 688 struct bpf_insn *insn = insn_buf; 689 const int ret = BPF_REG_0; 690 const int ref_reg = BPF_REG_1; 691 692 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 693 (void *(*)(struct bpf_map *map, void *key))NULL)); 694 *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); 695 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); 696 *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, 697 offsetof(struct htab_elem, lru_node) + 698 offsetof(struct bpf_lru_node, ref)); 699 *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); 700 *insn++ = BPF_ST_MEM(BPF_B, ret, 701 offsetof(struct htab_elem, lru_node) + 702 offsetof(struct bpf_lru_node, ref), 703 1); 704 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 705 offsetof(struct htab_elem, key) + 706 round_up(map->key_size, 8)); 707 return insn - insn_buf; 708 } 709 710 /* It is called from the bpf_lru_list when the LRU needs to delete 711 * older elements from the htab. 
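 * Returns true when the target node was found in its bucket and unlinked,
 * so the LRU code knows the node can be reused for the new element.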
712 */ 713 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) 714 { 715 struct bpf_htab *htab = (struct bpf_htab *)arg; 716 struct htab_elem *l = NULL, *tgt_l; 717 struct hlist_nulls_head *head; 718 struct hlist_nulls_node *n; 719 unsigned long flags; 720 struct bucket *b; 721 int ret; 722 723 tgt_l = container_of(node, struct htab_elem, lru_node); 724 b = __select_bucket(htab, tgt_l->hash); 725 head = &b->head; 726 727 ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); 728 if (ret) 729 return false; 730 731 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 732 if (l == tgt_l) { 733 hlist_nulls_del_rcu(&l->hash_node); 734 break; 735 } 736 737 htab_unlock_bucket(htab, b, tgt_l->hash, flags); 738 739 return l == tgt_l; 740 } 741 742 /* Called from syscall */ 743 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 744 { 745 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 746 struct hlist_nulls_head *head; 747 struct htab_elem *l, *next_l; 748 u32 hash, key_size; 749 int i = 0; 750 751 WARN_ON_ONCE(!rcu_read_lock_held()); 752 753 key_size = map->key_size; 754 755 if (!key) 756 goto find_first_elem; 757 758 hash = htab_map_hash(key, key_size, htab->hashrnd); 759 760 head = select_bucket(htab, hash); 761 762 /* lookup the key */ 763 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 764 765 if (!l) 766 goto find_first_elem; 767 768 /* key was found, get next key in the same bucket */ 769 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), 770 struct htab_elem, hash_node); 771 772 if (next_l) { 773 /* if next elem in this hash list is non-zero, just return it */ 774 memcpy(next_key, next_l->key, key_size); 775 return 0; 776 } 777 778 /* no more elements in this hash list, go to the next bucket */ 779 i = hash & (htab->n_buckets - 1); 780 i++; 781 782 find_first_elem: 783 /* iterate over buckets */ 784 for (; i < htab->n_buckets; i++) { 785 head = select_bucket(htab, i); 786 787 /* pick first element in the bucket */ 788 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), 789 struct htab_elem, hash_node); 790 if (next_l) { 791 /* if it's not empty, just return it */ 792 memcpy(next_key, next_l->key, key_size); 793 return 0; 794 } 795 } 796 797 /* iterated over all buckets and all elements */ 798 return -ENOENT; 799 } 800 801 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) 802 { 803 if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) 804 free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); 805 kfree(l); 806 } 807 808 static void htab_elem_free_rcu(struct rcu_head *head) 809 { 810 struct htab_elem *l = container_of(head, struct htab_elem, rcu); 811 struct bpf_htab *htab = l->htab; 812 813 htab_elem_free(htab, l); 814 } 815 816 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) 817 { 818 struct bpf_map *map = &htab->map; 819 void *ptr; 820 821 if (map->ops->map_fd_put_ptr) { 822 ptr = fd_htab_map_get_ptr(map, l); 823 map->ops->map_fd_put_ptr(ptr); 824 } 825 } 826 827 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 828 { 829 htab_put_fd_value(htab, l); 830 831 if (htab_is_prealloc(htab)) { 832 __pcpu_freelist_push(&htab->freelist, &l->fnode); 833 } else { 834 atomic_dec(&htab->count); 835 l->htab = htab; 836 call_rcu(&l->rcu, htab_elem_free_rcu); 837 } 838 } 839 840 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, 841 void *value, bool onallcpus) 
842 { 843 if (!onallcpus) { 844 /* copy true value_size bytes */ 845 memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); 846 } else { 847 u32 size = round_up(htab->map.value_size, 8); 848 int off = 0, cpu; 849 850 for_each_possible_cpu(cpu) { 851 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), 852 value + off, size); 853 off += size; 854 } 855 } 856 } 857 858 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, 859 void *value, bool onallcpus) 860 { 861 /* When using prealloc and not setting the initial value on all cpus, 862 * zero-fill element values for other cpus (just as what happens when 863 * not using prealloc). Otherwise, bpf program has no way to ensure 864 * known initial values for cpus other than current one 865 * (onallcpus=false always when coming from bpf prog). 866 */ 867 if (htab_is_prealloc(htab) && !onallcpus) { 868 u32 size = round_up(htab->map.value_size, 8); 869 int current_cpu = raw_smp_processor_id(); 870 int cpu; 871 872 for_each_possible_cpu(cpu) { 873 if (cpu == current_cpu) 874 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, 875 size); 876 else 877 memset(per_cpu_ptr(pptr, cpu), 0, size); 878 } 879 } else { 880 pcpu_copy_value(htab, pptr, value, onallcpus); 881 } 882 } 883 884 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) 885 { 886 return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && 887 BITS_PER_LONG == 64; 888 } 889 890 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, 891 void *value, u32 key_size, u32 hash, 892 bool percpu, bool onallcpus, 893 struct htab_elem *old_elem) 894 { 895 u32 size = htab->map.value_size; 896 bool prealloc = htab_is_prealloc(htab); 897 struct htab_elem *l_new, **pl_new; 898 void __percpu *pptr; 899 900 if (prealloc) { 901 if (old_elem) { 902 /* if we're updating the existing element, 903 * use per-cpu extra elems to avoid freelist_pop/push 904 */ 905 pl_new = this_cpu_ptr(htab->extra_elems); 906 l_new = *pl_new; 907 htab_put_fd_value(htab, old_elem); 908 *pl_new = old_elem; 909 } else { 910 struct pcpu_freelist_node *l; 911 912 l = __pcpu_freelist_pop(&htab->freelist); 913 if (!l) 914 return ERR_PTR(-E2BIG); 915 l_new = container_of(l, struct htab_elem, fnode); 916 } 917 } else { 918 if (atomic_inc_return(&htab->count) > htab->map.max_entries) 919 if (!old_elem) { 920 /* when map is full and update() is replacing 921 * old element, it's ok to allocate, since 922 * old element will be freed immediately. 
923 * Otherwise return an error 924 */ 925 l_new = ERR_PTR(-E2BIG); 926 goto dec_count; 927 } 928 l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, 929 htab->map.numa_node); 930 if (!l_new) { 931 l_new = ERR_PTR(-ENOMEM); 932 goto dec_count; 933 } 934 check_and_init_map_lock(&htab->map, 935 l_new->key + round_up(key_size, 8)); 936 } 937 938 memcpy(l_new->key, key, key_size); 939 if (percpu) { 940 size = round_up(size, 8); 941 if (prealloc) { 942 pptr = htab_elem_get_ptr(l_new, key_size); 943 } else { 944 /* alloc_percpu zero-fills */ 945 pptr = __alloc_percpu_gfp(size, 8, 946 GFP_ATOMIC | __GFP_NOWARN); 947 if (!pptr) { 948 kfree(l_new); 949 l_new = ERR_PTR(-ENOMEM); 950 goto dec_count; 951 } 952 } 953 954 pcpu_init_value(htab, pptr, value, onallcpus); 955 956 if (!prealloc) 957 htab_elem_set_ptr(l_new, key_size, pptr); 958 } else if (fd_htab_map_needs_adjust(htab)) { 959 size = round_up(size, 8); 960 memcpy(l_new->key + round_up(key_size, 8), value, size); 961 } else { 962 copy_map_value(&htab->map, 963 l_new->key + round_up(key_size, 8), 964 value); 965 } 966 967 l_new->hash = hash; 968 return l_new; 969 dec_count: 970 atomic_dec(&htab->count); 971 return l_new; 972 } 973 974 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, 975 u64 map_flags) 976 { 977 if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) 978 /* elem already exists */ 979 return -EEXIST; 980 981 if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) 982 /* elem doesn't exist, cannot update it */ 983 return -ENOENT; 984 985 return 0; 986 } 987 988 /* Called from syscall or from eBPF program */ 989 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, 990 u64 map_flags) 991 { 992 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 993 struct htab_elem *l_new = NULL, *l_old; 994 struct hlist_nulls_head *head; 995 unsigned long flags; 996 struct bucket *b; 997 u32 key_size, hash; 998 int ret; 999 1000 if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) 1001 /* unknown flags */ 1002 return -EINVAL; 1003 1004 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1005 1006 key_size = map->key_size; 1007 1008 hash = htab_map_hash(key, key_size, htab->hashrnd); 1009 1010 b = __select_bucket(htab, hash); 1011 head = &b->head; 1012 1013 if (unlikely(map_flags & BPF_F_LOCK)) { 1014 if (unlikely(!map_value_has_spin_lock(map))) 1015 return -EINVAL; 1016 /* find an element without taking the bucket lock */ 1017 l_old = lookup_nulls_elem_raw(head, hash, key, key_size, 1018 htab->n_buckets); 1019 ret = check_flags(htab, l_old, map_flags); 1020 if (ret) 1021 return ret; 1022 if (l_old) { 1023 /* grab the element lock and update value in place */ 1024 copy_map_value_locked(map, 1025 l_old->key + round_up(key_size, 8), 1026 value, false); 1027 return 0; 1028 } 1029 /* fall through, grab the bucket lock and lookup again. 1030 * 99.9% chance that the element won't be found, 1031 * but second lookup under lock has to be done. 1032 */ 1033 } 1034 1035 ret = htab_lock_bucket(htab, b, hash, &flags); 1036 if (ret) 1037 return ret; 1038 1039 l_old = lookup_elem_raw(head, hash, key, key_size); 1040 1041 ret = check_flags(htab, l_old, map_flags); 1042 if (ret) 1043 goto err; 1044 1045 if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { 1046 /* first lookup without the bucket lock didn't find the element, 1047 * but second lookup with the bucket lock found it. 
1048 * This case is highly unlikely, but has to be dealt with: 1049 * grab the element lock in addition to the bucket lock 1050 * and update element in place 1051 */ 1052 copy_map_value_locked(map, 1053 l_old->key + round_up(key_size, 8), 1054 value, false); 1055 ret = 0; 1056 goto err; 1057 } 1058 1059 l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, 1060 l_old); 1061 if (IS_ERR(l_new)) { 1062 /* all pre-allocated elements are in use or memory exhausted */ 1063 ret = PTR_ERR(l_new); 1064 goto err; 1065 } 1066 1067 /* add new element to the head of the list, so that 1068 * concurrent search will find it before old elem 1069 */ 1070 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1071 if (l_old) { 1072 hlist_nulls_del_rcu(&l_old->hash_node); 1073 if (!htab_is_prealloc(htab)) 1074 free_htab_elem(htab, l_old); 1075 } 1076 ret = 0; 1077 err: 1078 htab_unlock_bucket(htab, b, hash, flags); 1079 return ret; 1080 } 1081 1082 static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, 1083 u64 map_flags) 1084 { 1085 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1086 struct htab_elem *l_new, *l_old = NULL; 1087 struct hlist_nulls_head *head; 1088 unsigned long flags; 1089 struct bucket *b; 1090 u32 key_size, hash; 1091 int ret; 1092 1093 if (unlikely(map_flags > BPF_EXIST)) 1094 /* unknown flags */ 1095 return -EINVAL; 1096 1097 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1098 1099 key_size = map->key_size; 1100 1101 hash = htab_map_hash(key, key_size, htab->hashrnd); 1102 1103 b = __select_bucket(htab, hash); 1104 head = &b->head; 1105 1106 /* For LRU, we need to alloc before taking bucket's 1107 * spinlock because getting free nodes from LRU may need 1108 * to remove older elements from htab and this removal 1109 * operation will need a bucket lock. 
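	 * Taking the bucket lock first and then calling into the LRU would
	 * invert the lru_lock -> bucket_lock ordering described above
	 * prealloc_lru_pop() and could deadlock.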
1110 */ 1111 l_new = prealloc_lru_pop(htab, key, hash); 1112 if (!l_new) 1113 return -ENOMEM; 1114 memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); 1115 1116 ret = htab_lock_bucket(htab, b, hash, &flags); 1117 if (ret) 1118 return ret; 1119 1120 l_old = lookup_elem_raw(head, hash, key, key_size); 1121 1122 ret = check_flags(htab, l_old, map_flags); 1123 if (ret) 1124 goto err; 1125 1126 /* add new element to the head of the list, so that 1127 * concurrent search will find it before old elem 1128 */ 1129 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1130 if (l_old) { 1131 bpf_lru_node_set_ref(&l_new->lru_node); 1132 hlist_nulls_del_rcu(&l_old->hash_node); 1133 } 1134 ret = 0; 1135 1136 err: 1137 htab_unlock_bucket(htab, b, hash, flags); 1138 1139 if (ret) 1140 bpf_lru_push_free(&htab->lru, &l_new->lru_node); 1141 else if (l_old) 1142 bpf_lru_push_free(&htab->lru, &l_old->lru_node); 1143 1144 return ret; 1145 } 1146 1147 static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, 1148 void *value, u64 map_flags, 1149 bool onallcpus) 1150 { 1151 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1152 struct htab_elem *l_new = NULL, *l_old; 1153 struct hlist_nulls_head *head; 1154 unsigned long flags; 1155 struct bucket *b; 1156 u32 key_size, hash; 1157 int ret; 1158 1159 if (unlikely(map_flags > BPF_EXIST)) 1160 /* unknown flags */ 1161 return -EINVAL; 1162 1163 WARN_ON_ONCE(!rcu_read_lock_held()); 1164 1165 key_size = map->key_size; 1166 1167 hash = htab_map_hash(key, key_size, htab->hashrnd); 1168 1169 b = __select_bucket(htab, hash); 1170 head = &b->head; 1171 1172 ret = htab_lock_bucket(htab, b, hash, &flags); 1173 if (ret) 1174 return ret; 1175 1176 l_old = lookup_elem_raw(head, hash, key, key_size); 1177 1178 ret = check_flags(htab, l_old, map_flags); 1179 if (ret) 1180 goto err; 1181 1182 if (l_old) { 1183 /* per-cpu hash map can update value in-place */ 1184 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), 1185 value, onallcpus); 1186 } else { 1187 l_new = alloc_htab_elem(htab, key, value, key_size, 1188 hash, true, onallcpus, NULL); 1189 if (IS_ERR(l_new)) { 1190 ret = PTR_ERR(l_new); 1191 goto err; 1192 } 1193 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1194 } 1195 ret = 0; 1196 err: 1197 htab_unlock_bucket(htab, b, hash, flags); 1198 return ret; 1199 } 1200 1201 static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, 1202 void *value, u64 map_flags, 1203 bool onallcpus) 1204 { 1205 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1206 struct htab_elem *l_new = NULL, *l_old; 1207 struct hlist_nulls_head *head; 1208 unsigned long flags; 1209 struct bucket *b; 1210 u32 key_size, hash; 1211 int ret; 1212 1213 if (unlikely(map_flags > BPF_EXIST)) 1214 /* unknown flags */ 1215 return -EINVAL; 1216 1217 WARN_ON_ONCE(!rcu_read_lock_held()); 1218 1219 key_size = map->key_size; 1220 1221 hash = htab_map_hash(key, key_size, htab->hashrnd); 1222 1223 b = __select_bucket(htab, hash); 1224 head = &b->head; 1225 1226 /* For LRU, we need to alloc before taking bucket's 1227 * spinlock because LRU's elem alloc may need 1228 * to remove older elem from htab and this removal 1229 * operation will need a bucket lock. 
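	 * With BPF_EXIST the update can only modify an element that already
	 * exists, so no new element needs to be allocated up front.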
1230 */ 1231 if (map_flags != BPF_EXIST) { 1232 l_new = prealloc_lru_pop(htab, key, hash); 1233 if (!l_new) 1234 return -ENOMEM; 1235 } 1236 1237 ret = htab_lock_bucket(htab, b, hash, &flags); 1238 if (ret) 1239 return ret; 1240 1241 l_old = lookup_elem_raw(head, hash, key, key_size); 1242 1243 ret = check_flags(htab, l_old, map_flags); 1244 if (ret) 1245 goto err; 1246 1247 if (l_old) { 1248 bpf_lru_node_set_ref(&l_old->lru_node); 1249 1250 /* per-cpu hash map can update value in-place */ 1251 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), 1252 value, onallcpus); 1253 } else { 1254 pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), 1255 value, onallcpus); 1256 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1257 l_new = NULL; 1258 } 1259 ret = 0; 1260 err: 1261 htab_unlock_bucket(htab, b, hash, flags); 1262 if (l_new) 1263 bpf_lru_push_free(&htab->lru, &l_new->lru_node); 1264 return ret; 1265 } 1266 1267 static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, 1268 void *value, u64 map_flags) 1269 { 1270 return __htab_percpu_map_update_elem(map, key, value, map_flags, false); 1271 } 1272 1273 static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, 1274 void *value, u64 map_flags) 1275 { 1276 return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, 1277 false); 1278 } 1279 1280 /* Called from syscall or from eBPF program */ 1281 static int htab_map_delete_elem(struct bpf_map *map, void *key) 1282 { 1283 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1284 struct hlist_nulls_head *head; 1285 struct bucket *b; 1286 struct htab_elem *l; 1287 unsigned long flags; 1288 u32 hash, key_size; 1289 int ret; 1290 1291 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1292 1293 key_size = map->key_size; 1294 1295 hash = htab_map_hash(key, key_size, htab->hashrnd); 1296 b = __select_bucket(htab, hash); 1297 head = &b->head; 1298 1299 ret = htab_lock_bucket(htab, b, hash, &flags); 1300 if (ret) 1301 return ret; 1302 1303 l = lookup_elem_raw(head, hash, key, key_size); 1304 1305 if (l) { 1306 hlist_nulls_del_rcu(&l->hash_node); 1307 free_htab_elem(htab, l); 1308 } else { 1309 ret = -ENOENT; 1310 } 1311 1312 htab_unlock_bucket(htab, b, hash, flags); 1313 return ret; 1314 } 1315 1316 static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) 1317 { 1318 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1319 struct hlist_nulls_head *head; 1320 struct bucket *b; 1321 struct htab_elem *l; 1322 unsigned long flags; 1323 u32 hash, key_size; 1324 int ret; 1325 1326 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); 1327 1328 key_size = map->key_size; 1329 1330 hash = htab_map_hash(key, key_size, htab->hashrnd); 1331 b = __select_bucket(htab, hash); 1332 head = &b->head; 1333 1334 ret = htab_lock_bucket(htab, b, hash, &flags); 1335 if (ret) 1336 return ret; 1337 1338 l = lookup_elem_raw(head, hash, key, key_size); 1339 1340 if (l) 1341 hlist_nulls_del_rcu(&l->hash_node); 1342 else 1343 ret = -ENOENT; 1344 1345 htab_unlock_bucket(htab, b, hash, flags); 1346 if (l) 1347 bpf_lru_push_free(&htab->lru, &l->lru_node); 1348 return ret; 1349 } 1350 1351 static void delete_all_elements(struct bpf_htab *htab) 1352 { 1353 int i; 1354 1355 for (i = 0; i < htab->n_buckets; i++) { 1356 struct hlist_nulls_head *head = select_bucket(htab, i); 1357 struct hlist_nulls_node *n; 1358 struct htab_elem *l; 1359 1360 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { 1361 
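			/* The map is no longer reachable from any BPF program
			 * or syscall, so elements can be freed immediately
			 * without waiting for an RCU grace period.
			 */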
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* some of the free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (!htab_is_prealloc(htab))
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
		free_percpu(htab->map_locked[i]);
	lockdep_unregister_key(&htab->lockdep_key);
	kfree(htab);
}

static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
				   struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = htab_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": ");
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static int
__htab_map_lookup_and_delete_batch(struct bpf_map *map,
				   const union bpf_attr *attr,
				   union bpf_attr __user *uattr,
				   bool do_delete, bool is_lru_map,
				   bool is_percpu)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
	u32 batch, max_count, size, bucket_size;
	struct htab_elem *node_to_free = NULL;
	u64 elem_map_flags, map_flags;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags = 0;
	bool locked = false;
	struct htab_elem *l;
	struct bucket *b;
	int ret = 0;

	elem_map_flags = attr->batch.elem_flags;
	if ((elem_map_flags & ~BPF_F_LOCK) ||
	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
		return -EINVAL;

	map_flags = attr->batch.flags;
	if (map_flags)
		return -EINVAL;

	max_count = attr->batch.count;
	if (!max_count)
		return 0;

	if (put_user(0, &uattr->batch.count))
		return -EFAULT;

	batch = 0;
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;

	if (batch >= htab->n_buckets)
		return -ENOENT;

	key_size = htab->map.key_size;
	roundup_key_size = round_up(htab->map.key_size, 8);
	value_size = htab->map.value_size;
	size = round_up(value_size, 8);
	if (is_percpu)
		value_size = size * num_possible_cpus();
	total = 0;
	/* while experimenting with hash tables with sizes ranging from 10 to
	 * 1000, it was observed that a bucket can have up to 5 entries.
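	 * If a bucket turns out to hold more than bucket_size entries, the
	 * buffers are enlarged and the bucket is scanned again (see the
	 * bucket_cnt > bucket_size handling below).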
	 */
	bucket_size = 5;

alloc:
	/* We cannot do copy_from_user or copy_to_user inside
	 * the rcu_read_lock. Allocate enough space here.
	 */
	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
	if (!keys || !values) {
		ret = -ENOMEM;
		goto after_loop;
	}

again:
	bpf_disable_instrumentation();
	rcu_read_lock();
again_nocopy:
	dst_key = keys;
	dst_val = values;
	b = &htab->buckets[batch];
	head = &b->head;
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(htab, b, batch, &flags);
		if (ret)
			goto next_batch;
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* Next block is only safe to run if you have grabbed the lock */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(dst_val + off,
						per_cpu_ptr(pptr, cpu), size);
				off += size;
			}
		} else {
			value = l->key + roundup_key_size;
			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			check_and_init_map_lock(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 */
			if (is_lru_map) {
				l->batch_flink = node_to_free;
				node_to_free = l;
			} else {
				free_htab_elem(htab, l);
			}
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(htab, b, batch, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	}

next_batch:
	/* If we are not copying data, we can go to the next bucket and avoid
	 * unlocking the RCU read lock.
1592 */ 1593 if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { 1594 batch++; 1595 goto again_nocopy; 1596 } 1597 1598 rcu_read_unlock(); 1599 bpf_enable_instrumentation(); 1600 if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, 1601 key_size * bucket_cnt) || 1602 copy_to_user(uvalues + total * value_size, values, 1603 value_size * bucket_cnt))) { 1604 ret = -EFAULT; 1605 goto after_loop; 1606 } 1607 1608 total += bucket_cnt; 1609 batch++; 1610 if (batch >= htab->n_buckets) { 1611 ret = -ENOENT; 1612 goto after_loop; 1613 } 1614 goto again; 1615 1616 after_loop: 1617 if (ret == -EFAULT) 1618 goto out; 1619 1620 /* copy # of entries and next batch */ 1621 ubatch = u64_to_user_ptr(attr->batch.out_batch); 1622 if (copy_to_user(ubatch, &batch, sizeof(batch)) || 1623 put_user(total, &uattr->batch.count)) 1624 ret = -EFAULT; 1625 1626 out: 1627 kvfree(keys); 1628 kvfree(values); 1629 return ret; 1630 } 1631 1632 static int 1633 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1634 union bpf_attr __user *uattr) 1635 { 1636 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1637 false, true); 1638 } 1639 1640 static int 1641 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, 1642 const union bpf_attr *attr, 1643 union bpf_attr __user *uattr) 1644 { 1645 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1646 false, true); 1647 } 1648 1649 static int 1650 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1651 union bpf_attr __user *uattr) 1652 { 1653 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1654 false, false); 1655 } 1656 1657 static int 1658 htab_map_lookup_and_delete_batch(struct bpf_map *map, 1659 const union bpf_attr *attr, 1660 union bpf_attr __user *uattr) 1661 { 1662 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1663 false, false); 1664 } 1665 1666 static int 1667 htab_lru_percpu_map_lookup_batch(struct bpf_map *map, 1668 const union bpf_attr *attr, 1669 union bpf_attr __user *uattr) 1670 { 1671 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1672 true, true); 1673 } 1674 1675 static int 1676 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, 1677 const union bpf_attr *attr, 1678 union bpf_attr __user *uattr) 1679 { 1680 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1681 true, true); 1682 } 1683 1684 static int 1685 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1686 union bpf_attr __user *uattr) 1687 { 1688 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1689 true, false); 1690 } 1691 1692 static int 1693 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, 1694 const union bpf_attr *attr, 1695 union bpf_attr __user *uattr) 1696 { 1697 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1698 true, false); 1699 } 1700 1701 struct bpf_iter_seq_hash_map_info { 1702 struct bpf_map *map; 1703 struct bpf_htab *htab; 1704 void *percpu_value_buf; // non-zero means percpu hash 1705 u32 bucket_id; 1706 u32 skip_elems; 1707 }; 1708 1709 static struct htab_elem * 1710 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, 1711 struct htab_elem *prev_elem) 1712 { 1713 const struct bpf_htab *htab = info->htab; 1714 u32 skip_elems = info->skip_elems; 1715 u32 bucket_id = info->bucket_id; 1716 struct hlist_nulls_head *head; 1717 struct hlist_nulls_node *n; 1718 struct htab_elem *elem; 1719 struct bucket *b; 1720 u32 i, count; 
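	/* The iterator position is (bucket_id, skip_elems): skip_elems counts
	 * how many elements of the current bucket have already been visited,
	 * so iteration can resume there when no prev_elem is available.
	 */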
1721 1722 if (bucket_id >= htab->n_buckets) 1723 return NULL; 1724 1725 /* try to find next elem in the same bucket */ 1726 if (prev_elem) { 1727 /* no update/deletion on this bucket, prev_elem should be still valid 1728 * and we won't skip elements. 1729 */ 1730 n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); 1731 elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); 1732 if (elem) 1733 return elem; 1734 1735 /* not found, unlock and go to the next bucket */ 1736 b = &htab->buckets[bucket_id++]; 1737 rcu_read_unlock(); 1738 skip_elems = 0; 1739 } 1740 1741 for (i = bucket_id; i < htab->n_buckets; i++) { 1742 b = &htab->buckets[i]; 1743 rcu_read_lock(); 1744 1745 count = 0; 1746 head = &b->head; 1747 hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { 1748 if (count >= skip_elems) { 1749 info->bucket_id = i; 1750 info->skip_elems = count; 1751 return elem; 1752 } 1753 count++; 1754 } 1755 1756 rcu_read_unlock(); 1757 skip_elems = 0; 1758 } 1759 1760 info->bucket_id = i; 1761 info->skip_elems = 0; 1762 return NULL; 1763 } 1764 1765 static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) 1766 { 1767 struct bpf_iter_seq_hash_map_info *info = seq->private; 1768 struct htab_elem *elem; 1769 1770 elem = bpf_hash_map_seq_find_next(info, NULL); 1771 if (!elem) 1772 return NULL; 1773 1774 if (*pos == 0) 1775 ++*pos; 1776 return elem; 1777 } 1778 1779 static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1780 { 1781 struct bpf_iter_seq_hash_map_info *info = seq->private; 1782 1783 ++*pos; 1784 ++info->skip_elems; 1785 return bpf_hash_map_seq_find_next(info, v); 1786 } 1787 1788 static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) 1789 { 1790 struct bpf_iter_seq_hash_map_info *info = seq->private; 1791 u32 roundup_key_size, roundup_value_size; 1792 struct bpf_iter__bpf_map_elem ctx = {}; 1793 struct bpf_map *map = info->map; 1794 struct bpf_iter_meta meta; 1795 int ret = 0, off = 0, cpu; 1796 struct bpf_prog *prog; 1797 void __percpu *pptr; 1798 1799 meta.seq = seq; 1800 prog = bpf_iter_get_info(&meta, elem == NULL); 1801 if (prog) { 1802 ctx.meta = &meta; 1803 ctx.map = info->map; 1804 if (elem) { 1805 roundup_key_size = round_up(map->key_size, 8); 1806 ctx.key = elem->key; 1807 if (!info->percpu_value_buf) { 1808 ctx.value = elem->key + roundup_key_size; 1809 } else { 1810 roundup_value_size = round_up(map->value_size, 8); 1811 pptr = htab_elem_get_ptr(elem, map->key_size); 1812 for_each_possible_cpu(cpu) { 1813 bpf_long_memcpy(info->percpu_value_buf + off, 1814 per_cpu_ptr(pptr, cpu), 1815 roundup_value_size); 1816 off += roundup_value_size; 1817 } 1818 ctx.value = info->percpu_value_buf; 1819 } 1820 } 1821 ret = bpf_iter_run_prog(prog, &ctx); 1822 } 1823 1824 return ret; 1825 } 1826 1827 static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) 1828 { 1829 return __bpf_hash_map_seq_show(seq, v); 1830 } 1831 1832 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) 1833 { 1834 if (!v) 1835 (void)__bpf_hash_map_seq_show(seq, NULL); 1836 else 1837 rcu_read_unlock(); 1838 } 1839 1840 static int bpf_iter_init_hash_map(void *priv_data, 1841 struct bpf_iter_aux_info *aux) 1842 { 1843 struct bpf_iter_seq_hash_map_info *seq_info = priv_data; 1844 struct bpf_map *map = aux->map; 1845 void *value_buf; 1846 u32 buf_size; 1847 1848 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 1849 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 1850 buf_size = round_up(map->value_size, 8) * 
num_possible_cpus(); 1851 value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); 1852 if (!value_buf) 1853 return -ENOMEM; 1854 1855 seq_info->percpu_value_buf = value_buf; 1856 } 1857 1858 seq_info->map = map; 1859 seq_info->htab = container_of(map, struct bpf_htab, map); 1860 return 0; 1861 } 1862 1863 static void bpf_iter_fini_hash_map(void *priv_data) 1864 { 1865 struct bpf_iter_seq_hash_map_info *seq_info = priv_data; 1866 1867 kfree(seq_info->percpu_value_buf); 1868 } 1869 1870 static const struct seq_operations bpf_hash_map_seq_ops = { 1871 .start = bpf_hash_map_seq_start, 1872 .next = bpf_hash_map_seq_next, 1873 .stop = bpf_hash_map_seq_stop, 1874 .show = bpf_hash_map_seq_show, 1875 }; 1876 1877 static const struct bpf_iter_seq_info iter_seq_info = { 1878 .seq_ops = &bpf_hash_map_seq_ops, 1879 .init_seq_private = bpf_iter_init_hash_map, 1880 .fini_seq_private = bpf_iter_fini_hash_map, 1881 .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), 1882 }; 1883 1884 static int htab_map_btf_id; 1885 const struct bpf_map_ops htab_map_ops = { 1886 .map_meta_equal = bpf_map_meta_equal, 1887 .map_alloc_check = htab_map_alloc_check, 1888 .map_alloc = htab_map_alloc, 1889 .map_free = htab_map_free, 1890 .map_get_next_key = htab_map_get_next_key, 1891 .map_lookup_elem = htab_map_lookup_elem, 1892 .map_update_elem = htab_map_update_elem, 1893 .map_delete_elem = htab_map_delete_elem, 1894 .map_gen_lookup = htab_map_gen_lookup, 1895 .map_seq_show_elem = htab_map_seq_show_elem, 1896 BATCH_OPS(htab), 1897 .map_btf_name = "bpf_htab", 1898 .map_btf_id = &htab_map_btf_id, 1899 .iter_seq_info = &iter_seq_info, 1900 }; 1901 1902 static int htab_lru_map_btf_id; 1903 const struct bpf_map_ops htab_lru_map_ops = { 1904 .map_meta_equal = bpf_map_meta_equal, 1905 .map_alloc_check = htab_map_alloc_check, 1906 .map_alloc = htab_map_alloc, 1907 .map_free = htab_map_free, 1908 .map_get_next_key = htab_map_get_next_key, 1909 .map_lookup_elem = htab_lru_map_lookup_elem, 1910 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, 1911 .map_update_elem = htab_lru_map_update_elem, 1912 .map_delete_elem = htab_lru_map_delete_elem, 1913 .map_gen_lookup = htab_lru_map_gen_lookup, 1914 .map_seq_show_elem = htab_map_seq_show_elem, 1915 BATCH_OPS(htab_lru), 1916 .map_btf_name = "bpf_htab", 1917 .map_btf_id = &htab_lru_map_btf_id, 1918 .iter_seq_info = &iter_seq_info, 1919 }; 1920 1921 /* Called from eBPF program */ 1922 static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) 1923 { 1924 struct htab_elem *l = __htab_map_lookup_elem(map, key); 1925 1926 if (l) 1927 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); 1928 else 1929 return NULL; 1930 } 1931 1932 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) 1933 { 1934 struct htab_elem *l = __htab_map_lookup_elem(map, key); 1935 1936 if (l) { 1937 bpf_lru_node_set_ref(&l->lru_node); 1938 return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); 1939 } 1940 1941 return NULL; 1942 } 1943 1944 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) 1945 { 1946 struct htab_elem *l; 1947 void __percpu *pptr; 1948 int ret = -ENOENT; 1949 int cpu, off = 0; 1950 u32 size; 1951 1952 /* per_cpu areas are zero-filled and bpf programs can only 1953 * access 'value_size' of them, so copying rounded areas 1954 * will not leak any kernel data 1955 */ 1956 size = round_up(map->value_size, 8); 1957 rcu_read_lock(); 1958 l = __htab_map_lookup_elem(map, key); 1959 if (!l) 1960 goto out; 1961 /* We do not mark LRU map 

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

static int htab_percpu_map_btf_id;
const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_percpu),
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_percpu_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int htab_lru_percpu_map_btf_id;
const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_lru_percpu_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};
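
/*
 * Illustrative sketch (not part of this file): BPF program side usage of a
 * per-CPU hash map. The declaration follows the libbpf BTF-defined map
 * convention; all names are made up and the entry is assumed to have been
 * created beforehand (e.g. by user space via bpf_map_update_elem()). From
 * program context, bpf_map_lookup_elem() resolves to
 * htab_percpu_map_lookup_elem() above and returns only the current CPU's
 * copy of the value, so a plain increment is typically sufficient.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
 *		__uint(max_entries, 1024);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	} pcpu_counters SEC(".maps");
 *
 *	static __always_inline void bump(__u32 id)
 *	{
 *		__u64 *cnt = bpf_map_lookup_elem(&pcpu_counters, &id);
 *
 *		if (cnt)
 *			*cnt += 1;
 *	}
 */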

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(ptr);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	if (ret)
		map->ops->map_fd_put_ptr(ptr);

	return ret;
}

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

static int htab_of_maps_map_btf_id;
const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_htab",
	.map_btf_id = &htab_of_maps_map_btf_id,
};
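
/*
 * Illustrative sketch (not part of this file): how a hash-of-maps is
 * typically declared and used from a BPF program, matching
 * htab_of_maps_map_ops above. The outer map stores inner-map file
 * descriptors from user space (hence value_size == sizeof(u32) enforced by
 * fd_htab_map_alloc_check()), while a lookup from program context returns a
 * pointer to the inner map itself via htab_of_map_lookup_elem(). All names
 * below are made up.
 *
 *	struct inner_map {
 *		__uint(type, BPF_MAP_TYPE_ARRAY);
 *		__uint(max_entries, 1);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	};
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
 *		__uint(max_entries, 64);
 *		__type(key, __u32);
 *		__array(values, struct inner_map);
 *	} outer_map SEC(".maps");
 *
 *	SEC("tracepoint/syscalls/sys_enter_getpid")
 *	int probe(void *ctx)
 *	{
 *		__u32 outer_key = 0, inner_key = 0;
 *		void *inner;
 *		__u64 *val;
 *
 *		inner = bpf_map_lookup_elem(&outer_map, &outer_key);
 *		if (!inner)
 *			return 0;
 *		val = bpf_map_lookup_elem(inner, &inner_key);
 *		if (val)
 *			*val += 1;
 *		return 0;
 *	}
 */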