// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/rcupdate_wait.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>

#define HTAB_CREATE_FLAG_MASK						\
	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)

#define BATCH_OPS(_name)			\
	.map_lookup_batch =			\
	_name##_map_lookup_batch,		\
	.map_lookup_and_delete_batch =		\
	_name##_map_lookup_and_delete_batch,	\
	.map_update_batch =			\
	generic_map_update_batch,		\
	.map_delete_batch =			\
	generic_map_delete_batch

/*
 * The bucket lock has two protection scopes:
 *
 * 1) Serializing concurrent operations from BPF programs on different
 *    CPUs
 *
 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
 *
 * BPF programs can execute in any context including perf, kprobes and
 * tracing. As there are almost no limits on where perf, kprobes and
 * tracing can be invoked from, the lock operations need to be protected
 * against deadlocks. Deadlocks can be caused by recursion and by an
 * invocation in the lock held section when functions which acquire this
 * lock are invoked from sys_bpf(). BPF recursion is prevented by
 * incrementing the per CPU variable bpf_prog_active, which prevents BPF
 * programs attached to perf events, kprobes and tracing from being
 * invoked before the prior invocation from one of these contexts has
 * completed. sys_bpf() uses the same mechanism by pinning the task to
 * the current CPU and incrementing the recursion protection across the
 * map operation.
 *
 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
 * operations like memory allocations (even with GFP_ATOMIC) from atomic
 * contexts. This is required because even with GFP_ATOMIC the memory
 * allocator calls into code paths which acquire locks with long held lock
 * sections. To ensure deterministic behaviour, these locks are regular
 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
 * true atomic contexts on an RT kernel are the low level hardware
 * handling, scheduling, low level interrupt handling, NMIs etc. None of
 * these contexts should ever do memory allocations.
 *
 * As regular device interrupt handlers and soft interrupts are forced into
 * thread context, the existing code which does
 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
 * just works.
 *
 * In theory the BPF locks could be converted to regular spinlocks as well,
 * but the bucket locks and percpu_freelist locks can be taken from
 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
 * it was only safe to use a raw spinlock for a preallocated hash map on an
 * RT kernel, because there was no memory allocation within the lock held
 * sections. However, now that the hash map has been fully converted to use
 * bpf_mem_alloc, memory allocation for non-preallocated hash maps no
 * longer happens synchronously under the lock, so it is safe to always use
 * a raw spinlock for the bucket lock.
 */
struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t raw_lock;
};

#define HASHTAB_MAP_LOCK_COUNT 8
#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)

struct bpf_htab {
	struct bpf_map map;
	struct bpf_mem_alloc ma;
	struct bpf_mem_alloc pcpu_ma;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	struct htab_elem *__percpu *extra_elems;
	/* The number of elements of a non-preallocated hashtable is kept
	 * in either pcount or count.
	 */
	struct percpu_counter pcount;
	atomic_t count;
	bool use_percpu_counter;
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
	u32 hashrnd;
	struct lock_class_key lockdep_key;
	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct pcpu_freelist_node fnode;
				struct htab_elem *batch_flink;
			};
		};
	};
	union {
		/* pointer to per-cpu pointer */
		void *ptr_to_pptr;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[] __aligned(8);
};

static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}

static void htab_init_buckets(struct bpf_htab *htab)
{
	unsigned int i;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].raw_lock);
		lockdep_set_class(&htab->buckets[i].raw_lock,
				  &htab->lockdep_key);
		cond_resched();
	}
}

static inline int htab_lock_bucket(const struct bpf_htab *htab,
				   struct bucket *b, u32 hash,
				   unsigned long *pflags)
{
	unsigned long flags;

	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);

	preempt_disable();
	local_irq_save(flags);
	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
		__this_cpu_dec(*(htab->map_locked[hash]));
		local_irq_restore(flags);
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock(&b->raw_lock);
	*pflags = flags;

	return 0;
}

static inline void htab_unlock_bucket(const struct bpf_htab *htab,
				      struct bucket *b, u32 hash,
				      unsigned long flags)
{
	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
	raw_spin_unlock(&b->raw_lock);
	__this_cpu_dec(*(htab->map_locked[hash]));
	local_irq_restore(flags);
	preempt_enable();
}

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key +
key_size) = pptr; 202 } 203 204 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) 205 { 206 return *(void __percpu **)(l->key + key_size); 207 } 208 209 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) 210 { 211 return *(void **)(l->key + roundup(map->key_size, 8)); 212 } 213 214 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) 215 { 216 return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); 217 } 218 219 static bool htab_has_extra_elems(struct bpf_htab *htab) 220 { 221 return !htab_is_percpu(htab) && !htab_is_lru(htab); 222 } 223 224 static void htab_free_prealloced_timers(struct bpf_htab *htab) 225 { 226 u32 num_entries = htab->map.max_entries; 227 int i; 228 229 if (!btf_record_has_field(htab->map.record, BPF_TIMER)) 230 return; 231 if (htab_has_extra_elems(htab)) 232 num_entries += num_possible_cpus(); 233 234 for (i = 0; i < num_entries; i++) { 235 struct htab_elem *elem; 236 237 elem = get_htab_elem(htab, i); 238 bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); 239 cond_resched(); 240 } 241 } 242 243 static void htab_free_prealloced_fields(struct bpf_htab *htab) 244 { 245 u32 num_entries = htab->map.max_entries; 246 int i; 247 248 if (IS_ERR_OR_NULL(htab->map.record)) 249 return; 250 if (htab_has_extra_elems(htab)) 251 num_entries += num_possible_cpus(); 252 for (i = 0; i < num_entries; i++) { 253 struct htab_elem *elem; 254 255 elem = get_htab_elem(htab, i); 256 if (htab_is_percpu(htab)) { 257 void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); 258 int cpu; 259 260 for_each_possible_cpu(cpu) { 261 bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); 262 cond_resched(); 263 } 264 } else { 265 bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); 266 cond_resched(); 267 } 268 cond_resched(); 269 } 270 } 271 272 static void htab_free_elems(struct bpf_htab *htab) 273 { 274 int i; 275 276 if (!htab_is_percpu(htab)) 277 goto free_elems; 278 279 for (i = 0; i < htab->map.max_entries; i++) { 280 void __percpu *pptr; 281 282 pptr = htab_elem_get_ptr(get_htab_elem(htab, i), 283 htab->map.key_size); 284 free_percpu(pptr); 285 cond_resched(); 286 } 287 free_elems: 288 bpf_map_area_free(htab->elems); 289 } 290 291 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock 292 * (bucket_lock). If both locks need to be acquired together, the lock 293 * order is always lru_lock -> bucket_lock and this only happens in 294 * bpf_lru_list.c logic. For example, certain code path of 295 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), 296 * will acquire lru_lock first followed by acquiring bucket_lock. 297 * 298 * In hashtab.c, to avoid deadlock, lock acquisition of 299 * bucket_lock followed by lru_lock is not allowed. In such cases, 300 * bucket_lock needs to be released first before acquiring lru_lock. 
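 *
 * As an illustration, the only allowed nesting is the one used by the
 * LRU code itself, roughly:
 *
 *   bpf_lru_pop_free()              // takes lru_lock
 *     htab_lru_map_delete_node()    // takes the victim's bucket_lock
 *
 * whereas a path that already holds a bucket_lock must drop it before
 * calling into bpf_lru_*() helpers which may take lru_lock.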
301 */ 302 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, 303 u32 hash) 304 { 305 struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); 306 struct htab_elem *l; 307 308 if (node) { 309 bpf_map_inc_elem_count(&htab->map); 310 l = container_of(node, struct htab_elem, lru_node); 311 memcpy(l->key, key, htab->map.key_size); 312 return l; 313 } 314 315 return NULL; 316 } 317 318 static int prealloc_init(struct bpf_htab *htab) 319 { 320 u32 num_entries = htab->map.max_entries; 321 int err = -ENOMEM, i; 322 323 if (htab_has_extra_elems(htab)) 324 num_entries += num_possible_cpus(); 325 326 htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, 327 htab->map.numa_node); 328 if (!htab->elems) 329 return -ENOMEM; 330 331 if (!htab_is_percpu(htab)) 332 goto skip_percpu_elems; 333 334 for (i = 0; i < num_entries; i++) { 335 u32 size = round_up(htab->map.value_size, 8); 336 void __percpu *pptr; 337 338 pptr = bpf_map_alloc_percpu(&htab->map, size, 8, 339 GFP_USER | __GFP_NOWARN); 340 if (!pptr) 341 goto free_elems; 342 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, 343 pptr); 344 cond_resched(); 345 } 346 347 skip_percpu_elems: 348 if (htab_is_lru(htab)) 349 err = bpf_lru_init(&htab->lru, 350 htab->map.map_flags & BPF_F_NO_COMMON_LRU, 351 offsetof(struct htab_elem, hash) - 352 offsetof(struct htab_elem, lru_node), 353 htab_lru_map_delete_node, 354 htab); 355 else 356 err = pcpu_freelist_init(&htab->freelist); 357 358 if (err) 359 goto free_elems; 360 361 if (htab_is_lru(htab)) 362 bpf_lru_populate(&htab->lru, htab->elems, 363 offsetof(struct htab_elem, lru_node), 364 htab->elem_size, num_entries); 365 else 366 pcpu_freelist_populate(&htab->freelist, 367 htab->elems + offsetof(struct htab_elem, fnode), 368 htab->elem_size, num_entries); 369 370 return 0; 371 372 free_elems: 373 htab_free_elems(htab); 374 return err; 375 } 376 377 static void prealloc_destroy(struct bpf_htab *htab) 378 { 379 htab_free_elems(htab); 380 381 if (htab_is_lru(htab)) 382 bpf_lru_destroy(&htab->lru); 383 else 384 pcpu_freelist_destroy(&htab->freelist); 385 } 386 387 static int alloc_extra_elems(struct bpf_htab *htab) 388 { 389 struct htab_elem *__percpu *pptr, *l_new; 390 struct pcpu_freelist_node *l; 391 int cpu; 392 393 pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, 394 GFP_USER | __GFP_NOWARN); 395 if (!pptr) 396 return -ENOMEM; 397 398 for_each_possible_cpu(cpu) { 399 l = pcpu_freelist_pop(&htab->freelist); 400 /* pop will succeed, since prealloc_init() 401 * preallocated extra num_possible_cpus elements 402 */ 403 l_new = container_of(l, struct htab_elem, fnode); 404 *per_cpu_ptr(pptr, cpu) = l_new; 405 } 406 htab->extra_elems = pptr; 407 return 0; 408 } 409 410 /* Called from syscall */ 411 static int htab_map_alloc_check(union bpf_attr *attr) 412 { 413 bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || 414 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); 415 bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || 416 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); 417 /* percpu_lru means each cpu has its own LRU list. 418 * it is different from BPF_MAP_TYPE_PERCPU_HASH where 419 * the map's value itself is percpu. percpu_lru has 420 * nothing to do with the map's value. 
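 *
 * E.g. a BPF_MAP_TYPE_LRU_HASH map created with
 * map_flags = BPF_F_NO_COMMON_LRU keeps one LRU list per CPU while its
 * values stay shared, whereas BPF_MAP_TYPE_LRU_PERCPU_HASH without that
 * flag shares a common LRU list but keeps per-CPU values.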
 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
	int numa_node = bpf_map_attr_numa_node(attr);

	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (zero_seed && !capable(CAP_SYS_ADMIN))
		/* Guard against local DoS, and discourage production use. */
		return -EPERM;

	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags))
		return -EINVAL;

	if (!lru && percpu_lru)
		return -EINVAL;

	if (lru && !prealloc)
		return -ENOTSUPP;

	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
		return -EINVAL;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	if (attr->max_entries == 0 || attr->key_size == 0 ||
	    attr->value_size == 0)
		return -EINVAL;

	if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
	    sizeof(struct htab_elem))
		/* if key_size + value_size is bigger, user space won't be
		 * able to access the elements via the bpf syscall. This check
		 * also makes sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each CPU has its own LRU list.
	 * It is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu. percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;

	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&htab->lockdep_key);

	bpf_map_init_from_attr(&htab->map, attr);

	if (percpu_lru) {
		/* Ensure each CPU's LRU list has >= 1 element.
		 * While we are at it, give each LRU list the same
		 * number of elements.
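		 *
		 * E.g. with max_entries = 1000 and 64 possible CPUs,
		 * roundup() yields 1024, i.e. 16 elements per CPU's LRU
		 * list. Only if that roundup wraps around u32 does the
		 * code fall back to rounddown() below.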
494 */ 495 htab->map.max_entries = roundup(attr->max_entries, 496 num_possible_cpus()); 497 if (htab->map.max_entries < attr->max_entries) 498 htab->map.max_entries = rounddown(attr->max_entries, 499 num_possible_cpus()); 500 } 501 502 /* hash table size must be power of 2; roundup_pow_of_two() can overflow 503 * into UB on 32-bit arches, so check that first 504 */ 505 err = -E2BIG; 506 if (htab->map.max_entries > 1UL << 31) 507 goto free_htab; 508 509 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); 510 511 htab->elem_size = sizeof(struct htab_elem) + 512 round_up(htab->map.key_size, 8); 513 if (percpu) 514 htab->elem_size += sizeof(void *); 515 else 516 htab->elem_size += round_up(htab->map.value_size, 8); 517 518 /* check for u32 overflow */ 519 if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) 520 goto free_htab; 521 522 err = bpf_map_init_elem_count(&htab->map); 523 if (err) 524 goto free_htab; 525 526 err = -ENOMEM; 527 htab->buckets = bpf_map_area_alloc(htab->n_buckets * 528 sizeof(struct bucket), 529 htab->map.numa_node); 530 if (!htab->buckets) 531 goto free_elem_count; 532 533 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { 534 htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, 535 sizeof(int), 536 sizeof(int), 537 GFP_USER); 538 if (!htab->map_locked[i]) 539 goto free_map_locked; 540 } 541 542 if (htab->map.map_flags & BPF_F_ZERO_SEED) 543 htab->hashrnd = 0; 544 else 545 htab->hashrnd = get_random_u32(); 546 547 htab_init_buckets(htab); 548 549 /* compute_batch_value() computes batch value as num_online_cpus() * 2 550 * and __percpu_counter_compare() needs 551 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() 552 * for percpu_counter to be faster than atomic_t. In practice the average bpf 553 * hash map size is 10k, which means that a system with 64 cpus will fill 554 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore 555 * define our own batch count as 32 then 10k hash map can be filled up to 80%: 556 * 10k - 8k > 32 _batch_ * 64 _cpus_ 557 * and __percpu_counter_compare() will still be fast. At that point hash map 558 * collisions will dominate its performance anyway. Assume that hash map filled 559 * to 50+% isn't going to be O(1) and use the following formula to choose 560 * between percpu_counter and atomic_t. 561 */ 562 #define PERCPU_COUNTER_BATCH 32 563 if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) 564 htab->use_percpu_counter = true; 565 566 if (htab->use_percpu_counter) { 567 err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); 568 if (err) 569 goto free_map_locked; 570 } 571 572 if (prealloc) { 573 err = prealloc_init(htab); 574 if (err) 575 goto free_map_locked; 576 577 if (!percpu && !lru) { 578 /* lru itself can remove the least used element, so 579 * there is no need for an extra elem during map_update. 
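 *
 * Plain preallocated (non-LRU, non-percpu) maps, by contrast, keep
 * one spare element per CPU (extra_elems below) so that an update
 * replacing an existing key can take the spare and hand the old
 * element back as the new spare, instead of doing a freelist
 * pop/push under the bucket lock (see alloc_htab_elem()).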
580 */ 581 err = alloc_extra_elems(htab); 582 if (err) 583 goto free_prealloc; 584 } 585 } else { 586 err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); 587 if (err) 588 goto free_map_locked; 589 if (percpu) { 590 err = bpf_mem_alloc_init(&htab->pcpu_ma, 591 round_up(htab->map.value_size, 8), true); 592 if (err) 593 goto free_map_locked; 594 } 595 } 596 597 return &htab->map; 598 599 free_prealloc: 600 prealloc_destroy(htab); 601 free_map_locked: 602 if (htab->use_percpu_counter) 603 percpu_counter_destroy(&htab->pcount); 604 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 605 free_percpu(htab->map_locked[i]); 606 bpf_map_area_free(htab->buckets); 607 bpf_mem_alloc_destroy(&htab->pcpu_ma); 608 bpf_mem_alloc_destroy(&htab->ma); 609 free_elem_count: 610 bpf_map_free_elem_count(&htab->map); 611 free_htab: 612 lockdep_unregister_key(&htab->lockdep_key); 613 bpf_map_area_free(htab); 614 return ERR_PTR(err); 615 } 616 617 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) 618 { 619 if (likely(key_len % 4 == 0)) 620 return jhash2(key, key_len / 4, hashrnd); 621 return jhash(key, key_len, hashrnd); 622 } 623 624 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) 625 { 626 return &htab->buckets[hash & (htab->n_buckets - 1)]; 627 } 628 629 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) 630 { 631 return &__select_bucket(htab, hash)->head; 632 } 633 634 /* this lookup function can only be called with bucket lock taken */ 635 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, 636 void *key, u32 key_size) 637 { 638 struct hlist_nulls_node *n; 639 struct htab_elem *l; 640 641 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 642 if (l->hash == hash && !memcmp(&l->key, key, key_size)) 643 return l; 644 645 return NULL; 646 } 647 648 /* can be called without bucket lock. it will repeat the loop in 649 * the unlikely event when elements moved from one bucket into another 650 * while link list is being walked 651 */ 652 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, 653 u32 hash, void *key, 654 u32 key_size, u32 n_buckets) 655 { 656 struct hlist_nulls_node *n; 657 struct htab_elem *l; 658 659 again: 660 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 661 if (l->hash == hash && !memcmp(&l->key, key, key_size)) 662 return l; 663 664 if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) 665 goto again; 666 667 return NULL; 668 } 669 670 /* Called from syscall or from eBPF program directly, so 671 * arguments have to match bpf_map_lookup_elem() exactly. 672 * The return value is adjusted by BPF instructions 673 * in htab_map_gen_lookup(). 
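 *
 * Roughly, the code emitted by htab_map_gen_lookup() does
 *
 *   reg0 = __htab_map_lookup_elem(map, key);
 *   if (reg0)
 *           reg0 += offsetof(struct htab_elem, key) +
 *                   round_up(map->key_size, 8);
 *
 * so the program receives a pointer to the value area rather than to
 * the htab_elem itself.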
674 */ 675 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) 676 { 677 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 678 struct hlist_nulls_head *head; 679 struct htab_elem *l; 680 u32 hash, key_size; 681 682 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 683 !rcu_read_lock_bh_held()); 684 685 key_size = map->key_size; 686 687 hash = htab_map_hash(key, key_size, htab->hashrnd); 688 689 head = select_bucket(htab, hash); 690 691 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 692 693 return l; 694 } 695 696 static void *htab_map_lookup_elem(struct bpf_map *map, void *key) 697 { 698 struct htab_elem *l = __htab_map_lookup_elem(map, key); 699 700 if (l) 701 return l->key + round_up(map->key_size, 8); 702 703 return NULL; 704 } 705 706 /* inline bpf_map_lookup_elem() call. 707 * Instead of: 708 * bpf_prog 709 * bpf_map_lookup_elem 710 * map->ops->map_lookup_elem 711 * htab_map_lookup_elem 712 * __htab_map_lookup_elem 713 * do: 714 * bpf_prog 715 * __htab_map_lookup_elem 716 */ 717 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 718 { 719 struct bpf_insn *insn = insn_buf; 720 const int ret = BPF_REG_0; 721 722 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 723 (void *(*)(struct bpf_map *map, void *key))NULL)); 724 *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); 725 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); 726 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 727 offsetof(struct htab_elem, key) + 728 round_up(map->key_size, 8)); 729 return insn - insn_buf; 730 } 731 732 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, 733 void *key, const bool mark) 734 { 735 struct htab_elem *l = __htab_map_lookup_elem(map, key); 736 737 if (l) { 738 if (mark) 739 bpf_lru_node_set_ref(&l->lru_node); 740 return l->key + round_up(map->key_size, 8); 741 } 742 743 return NULL; 744 } 745 746 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 747 { 748 return __htab_lru_map_lookup_elem(map, key, true); 749 } 750 751 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) 752 { 753 return __htab_lru_map_lookup_elem(map, key, false); 754 } 755 756 static int htab_lru_map_gen_lookup(struct bpf_map *map, 757 struct bpf_insn *insn_buf) 758 { 759 struct bpf_insn *insn = insn_buf; 760 const int ret = BPF_REG_0; 761 const int ref_reg = BPF_REG_1; 762 763 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, 764 (void *(*)(struct bpf_map *map, void *key))NULL)); 765 *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); 766 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); 767 *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, 768 offsetof(struct htab_elem, lru_node) + 769 offsetof(struct bpf_lru_node, ref)); 770 *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); 771 *insn++ = BPF_ST_MEM(BPF_B, ret, 772 offsetof(struct htab_elem, lru_node) + 773 offsetof(struct bpf_lru_node, ref), 774 1); 775 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, 776 offsetof(struct htab_elem, key) + 777 round_up(map->key_size, 8)); 778 return insn - insn_buf; 779 } 780 781 static void check_and_free_fields(struct bpf_htab *htab, 782 struct htab_elem *elem) 783 { 784 if (htab_is_percpu(htab)) { 785 void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); 786 int cpu; 787 788 for_each_possible_cpu(cpu) 789 bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); 790 } else { 791 void *map_value = elem->key + round_up(htab->map.key_size, 8); 792 793 bpf_obj_free_fields(htab->map.record, map_value); 794 } 
795 } 796 797 /* It is called from the bpf_lru_list when the LRU needs to delete 798 * older elements from the htab. 799 */ 800 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) 801 { 802 struct bpf_htab *htab = arg; 803 struct htab_elem *l = NULL, *tgt_l; 804 struct hlist_nulls_head *head; 805 struct hlist_nulls_node *n; 806 unsigned long flags; 807 struct bucket *b; 808 int ret; 809 810 tgt_l = container_of(node, struct htab_elem, lru_node); 811 b = __select_bucket(htab, tgt_l->hash); 812 head = &b->head; 813 814 ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); 815 if (ret) 816 return false; 817 818 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 819 if (l == tgt_l) { 820 hlist_nulls_del_rcu(&l->hash_node); 821 check_and_free_fields(htab, l); 822 bpf_map_dec_elem_count(&htab->map); 823 break; 824 } 825 826 htab_unlock_bucket(htab, b, tgt_l->hash, flags); 827 828 return l == tgt_l; 829 } 830 831 /* Called from syscall */ 832 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 833 { 834 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 835 struct hlist_nulls_head *head; 836 struct htab_elem *l, *next_l; 837 u32 hash, key_size; 838 int i = 0; 839 840 WARN_ON_ONCE(!rcu_read_lock_held()); 841 842 key_size = map->key_size; 843 844 if (!key) 845 goto find_first_elem; 846 847 hash = htab_map_hash(key, key_size, htab->hashrnd); 848 849 head = select_bucket(htab, hash); 850 851 /* lookup the key */ 852 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); 853 854 if (!l) 855 goto find_first_elem; 856 857 /* key was found, get next key in the same bucket */ 858 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), 859 struct htab_elem, hash_node); 860 861 if (next_l) { 862 /* if next elem in this hash list is non-zero, just return it */ 863 memcpy(next_key, next_l->key, key_size); 864 return 0; 865 } 866 867 /* no more elements in this hash list, go to the next bucket */ 868 i = hash & (htab->n_buckets - 1); 869 i++; 870 871 find_first_elem: 872 /* iterate over buckets */ 873 for (; i < htab->n_buckets; i++) { 874 head = select_bucket(htab, i); 875 876 /* pick first element in the bucket */ 877 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), 878 struct htab_elem, hash_node); 879 if (next_l) { 880 /* if it's not empty, just return it */ 881 memcpy(next_key, next_l->key, key_size); 882 return 0; 883 } 884 } 885 886 /* iterated over all buckets and all elements */ 887 return -ENOENT; 888 } 889 890 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) 891 { 892 check_and_free_fields(htab, l); 893 if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) 894 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); 895 bpf_mem_cache_free(&htab->ma, l); 896 } 897 898 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) 899 { 900 struct bpf_map *map = &htab->map; 901 void *ptr; 902 903 if (map->ops->map_fd_put_ptr) { 904 ptr = fd_htab_map_get_ptr(map, l); 905 map->ops->map_fd_put_ptr(map, ptr, true); 906 } 907 } 908 909 static bool is_map_full(struct bpf_htab *htab) 910 { 911 if (htab->use_percpu_counter) 912 return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, 913 PERCPU_COUNTER_BATCH) >= 0; 914 return atomic_read(&htab->count) >= htab->map.max_entries; 915 } 916 917 static void inc_elem_count(struct bpf_htab *htab) 918 { 919 bpf_map_inc_elem_count(&htab->map); 920 921 if (htab->use_percpu_counter) 
922 percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); 923 else 924 atomic_inc(&htab->count); 925 } 926 927 static void dec_elem_count(struct bpf_htab *htab) 928 { 929 bpf_map_dec_elem_count(&htab->map); 930 931 if (htab->use_percpu_counter) 932 percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); 933 else 934 atomic_dec(&htab->count); 935 } 936 937 938 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) 939 { 940 htab_put_fd_value(htab, l); 941 942 if (htab_is_prealloc(htab)) { 943 bpf_map_dec_elem_count(&htab->map); 944 check_and_free_fields(htab, l); 945 __pcpu_freelist_push(&htab->freelist, &l->fnode); 946 } else { 947 dec_elem_count(htab); 948 htab_elem_free(htab, l); 949 } 950 } 951 952 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, 953 void *value, bool onallcpus) 954 { 955 if (!onallcpus) { 956 /* copy true value_size bytes */ 957 copy_map_value(&htab->map, this_cpu_ptr(pptr), value); 958 } else { 959 u32 size = round_up(htab->map.value_size, 8); 960 int off = 0, cpu; 961 962 for_each_possible_cpu(cpu) { 963 copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); 964 off += size; 965 } 966 } 967 } 968 969 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, 970 void *value, bool onallcpus) 971 { 972 /* When not setting the initial value on all cpus, zero-fill element 973 * values for other cpus. Otherwise, bpf program has no way to ensure 974 * known initial values for cpus other than current one 975 * (onallcpus=false always when coming from bpf prog). 976 */ 977 if (!onallcpus) { 978 int current_cpu = raw_smp_processor_id(); 979 int cpu; 980 981 for_each_possible_cpu(cpu) { 982 if (cpu == current_cpu) 983 copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); 984 else /* Since elem is preallocated, we cannot touch special fields */ 985 zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); 986 } 987 } else { 988 pcpu_copy_value(htab, pptr, value, onallcpus); 989 } 990 } 991 992 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) 993 { 994 return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && 995 BITS_PER_LONG == 64; 996 } 997 998 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, 999 void *value, u32 key_size, u32 hash, 1000 bool percpu, bool onallcpus, 1001 struct htab_elem *old_elem) 1002 { 1003 u32 size = htab->map.value_size; 1004 bool prealloc = htab_is_prealloc(htab); 1005 struct htab_elem *l_new, **pl_new; 1006 void __percpu *pptr; 1007 1008 if (prealloc) { 1009 if (old_elem) { 1010 /* if we're updating the existing element, 1011 * use per-cpu extra elems to avoid freelist_pop/push 1012 */ 1013 pl_new = this_cpu_ptr(htab->extra_elems); 1014 l_new = *pl_new; 1015 htab_put_fd_value(htab, old_elem); 1016 *pl_new = old_elem; 1017 } else { 1018 struct pcpu_freelist_node *l; 1019 1020 l = __pcpu_freelist_pop(&htab->freelist); 1021 if (!l) 1022 return ERR_PTR(-E2BIG); 1023 l_new = container_of(l, struct htab_elem, fnode); 1024 bpf_map_inc_elem_count(&htab->map); 1025 } 1026 } else { 1027 if (is_map_full(htab)) 1028 if (!old_elem) 1029 /* when map is full and update() is replacing 1030 * old element, it's ok to allocate, since 1031 * old element will be freed immediately. 
1032 * Otherwise return an error 1033 */ 1034 return ERR_PTR(-E2BIG); 1035 inc_elem_count(htab); 1036 l_new = bpf_mem_cache_alloc(&htab->ma); 1037 if (!l_new) { 1038 l_new = ERR_PTR(-ENOMEM); 1039 goto dec_count; 1040 } 1041 } 1042 1043 memcpy(l_new->key, key, key_size); 1044 if (percpu) { 1045 if (prealloc) { 1046 pptr = htab_elem_get_ptr(l_new, key_size); 1047 } else { 1048 /* alloc_percpu zero-fills */ 1049 pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); 1050 if (!pptr) { 1051 bpf_mem_cache_free(&htab->ma, l_new); 1052 l_new = ERR_PTR(-ENOMEM); 1053 goto dec_count; 1054 } 1055 l_new->ptr_to_pptr = pptr; 1056 pptr = *(void **)pptr; 1057 } 1058 1059 pcpu_init_value(htab, pptr, value, onallcpus); 1060 1061 if (!prealloc) 1062 htab_elem_set_ptr(l_new, key_size, pptr); 1063 } else if (fd_htab_map_needs_adjust(htab)) { 1064 size = round_up(size, 8); 1065 memcpy(l_new->key + round_up(key_size, 8), value, size); 1066 } else { 1067 copy_map_value(&htab->map, 1068 l_new->key + round_up(key_size, 8), 1069 value); 1070 } 1071 1072 l_new->hash = hash; 1073 return l_new; 1074 dec_count: 1075 dec_elem_count(htab); 1076 return l_new; 1077 } 1078 1079 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, 1080 u64 map_flags) 1081 { 1082 if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) 1083 /* elem already exists */ 1084 return -EEXIST; 1085 1086 if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) 1087 /* elem doesn't exist, cannot update it */ 1088 return -ENOENT; 1089 1090 return 0; 1091 } 1092 1093 /* Called from syscall or from eBPF program */ 1094 static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, 1095 u64 map_flags) 1096 { 1097 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1098 struct htab_elem *l_new = NULL, *l_old; 1099 struct hlist_nulls_head *head; 1100 unsigned long flags; 1101 struct bucket *b; 1102 u32 key_size, hash; 1103 int ret; 1104 1105 if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) 1106 /* unknown flags */ 1107 return -EINVAL; 1108 1109 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 1110 !rcu_read_lock_bh_held()); 1111 1112 key_size = map->key_size; 1113 1114 hash = htab_map_hash(key, key_size, htab->hashrnd); 1115 1116 b = __select_bucket(htab, hash); 1117 head = &b->head; 1118 1119 if (unlikely(map_flags & BPF_F_LOCK)) { 1120 if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) 1121 return -EINVAL; 1122 /* find an element without taking the bucket lock */ 1123 l_old = lookup_nulls_elem_raw(head, hash, key, key_size, 1124 htab->n_buckets); 1125 ret = check_flags(htab, l_old, map_flags); 1126 if (ret) 1127 return ret; 1128 if (l_old) { 1129 /* grab the element lock and update value in place */ 1130 copy_map_value_locked(map, 1131 l_old->key + round_up(key_size, 8), 1132 value, false); 1133 return 0; 1134 } 1135 /* fall through, grab the bucket lock and lookup again. 1136 * 99.9% chance that the element won't be found, 1137 * but second lookup under lock has to be done. 1138 */ 1139 } 1140 1141 ret = htab_lock_bucket(htab, b, hash, &flags); 1142 if (ret) 1143 return ret; 1144 1145 l_old = lookup_elem_raw(head, hash, key, key_size); 1146 1147 ret = check_flags(htab, l_old, map_flags); 1148 if (ret) 1149 goto err; 1150 1151 if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { 1152 /* first lookup without the bucket lock didn't find the element, 1153 * but second lookup with the bucket lock found it. 
1154 * This case is highly unlikely, but has to be dealt with: 1155 * grab the element lock in addition to the bucket lock 1156 * and update element in place 1157 */ 1158 copy_map_value_locked(map, 1159 l_old->key + round_up(key_size, 8), 1160 value, false); 1161 ret = 0; 1162 goto err; 1163 } 1164 1165 l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, 1166 l_old); 1167 if (IS_ERR(l_new)) { 1168 /* all pre-allocated elements are in use or memory exhausted */ 1169 ret = PTR_ERR(l_new); 1170 goto err; 1171 } 1172 1173 /* add new element to the head of the list, so that 1174 * concurrent search will find it before old elem 1175 */ 1176 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1177 if (l_old) { 1178 hlist_nulls_del_rcu(&l_old->hash_node); 1179 if (!htab_is_prealloc(htab)) 1180 free_htab_elem(htab, l_old); 1181 else 1182 check_and_free_fields(htab, l_old); 1183 } 1184 ret = 0; 1185 err: 1186 htab_unlock_bucket(htab, b, hash, flags); 1187 return ret; 1188 } 1189 1190 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) 1191 { 1192 check_and_free_fields(htab, elem); 1193 bpf_map_dec_elem_count(&htab->map); 1194 bpf_lru_push_free(&htab->lru, &elem->lru_node); 1195 } 1196 1197 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, 1198 u64 map_flags) 1199 { 1200 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1201 struct htab_elem *l_new, *l_old = NULL; 1202 struct hlist_nulls_head *head; 1203 unsigned long flags; 1204 struct bucket *b; 1205 u32 key_size, hash; 1206 int ret; 1207 1208 if (unlikely(map_flags > BPF_EXIST)) 1209 /* unknown flags */ 1210 return -EINVAL; 1211 1212 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 1213 !rcu_read_lock_bh_held()); 1214 1215 key_size = map->key_size; 1216 1217 hash = htab_map_hash(key, key_size, htab->hashrnd); 1218 1219 b = __select_bucket(htab, hash); 1220 head = &b->head; 1221 1222 /* For LRU, we need to alloc before taking bucket's 1223 * spinlock because getting free nodes from LRU may need 1224 * to remove older elements from htab and this removal 1225 * operation will need a bucket lock. 
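	 *
	 * Calling prealloc_lru_pop() first also keeps the documented
	 * lru_lock -> bucket_lock order: any eviction triggered by the
	 * pop happens before this path takes its own bucket lock below.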
1226 */ 1227 l_new = prealloc_lru_pop(htab, key, hash); 1228 if (!l_new) 1229 return -ENOMEM; 1230 copy_map_value(&htab->map, 1231 l_new->key + round_up(map->key_size, 8), value); 1232 1233 ret = htab_lock_bucket(htab, b, hash, &flags); 1234 if (ret) 1235 goto err_lock_bucket; 1236 1237 l_old = lookup_elem_raw(head, hash, key, key_size); 1238 1239 ret = check_flags(htab, l_old, map_flags); 1240 if (ret) 1241 goto err; 1242 1243 /* add new element to the head of the list, so that 1244 * concurrent search will find it before old elem 1245 */ 1246 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1247 if (l_old) { 1248 bpf_lru_node_set_ref(&l_new->lru_node); 1249 hlist_nulls_del_rcu(&l_old->hash_node); 1250 } 1251 ret = 0; 1252 1253 err: 1254 htab_unlock_bucket(htab, b, hash, flags); 1255 1256 err_lock_bucket: 1257 if (ret) 1258 htab_lru_push_free(htab, l_new); 1259 else if (l_old) 1260 htab_lru_push_free(htab, l_old); 1261 1262 return ret; 1263 } 1264 1265 static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, 1266 void *value, u64 map_flags, 1267 bool onallcpus) 1268 { 1269 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1270 struct htab_elem *l_new = NULL, *l_old; 1271 struct hlist_nulls_head *head; 1272 unsigned long flags; 1273 struct bucket *b; 1274 u32 key_size, hash; 1275 int ret; 1276 1277 if (unlikely(map_flags > BPF_EXIST)) 1278 /* unknown flags */ 1279 return -EINVAL; 1280 1281 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 1282 !rcu_read_lock_bh_held()); 1283 1284 key_size = map->key_size; 1285 1286 hash = htab_map_hash(key, key_size, htab->hashrnd); 1287 1288 b = __select_bucket(htab, hash); 1289 head = &b->head; 1290 1291 ret = htab_lock_bucket(htab, b, hash, &flags); 1292 if (ret) 1293 return ret; 1294 1295 l_old = lookup_elem_raw(head, hash, key, key_size); 1296 1297 ret = check_flags(htab, l_old, map_flags); 1298 if (ret) 1299 goto err; 1300 1301 if (l_old) { 1302 /* per-cpu hash map can update value in-place */ 1303 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), 1304 value, onallcpus); 1305 } else { 1306 l_new = alloc_htab_elem(htab, key, value, key_size, 1307 hash, true, onallcpus, NULL); 1308 if (IS_ERR(l_new)) { 1309 ret = PTR_ERR(l_new); 1310 goto err; 1311 } 1312 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1313 } 1314 ret = 0; 1315 err: 1316 htab_unlock_bucket(htab, b, hash, flags); 1317 return ret; 1318 } 1319 1320 static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, 1321 void *value, u64 map_flags, 1322 bool onallcpus) 1323 { 1324 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1325 struct htab_elem *l_new = NULL, *l_old; 1326 struct hlist_nulls_head *head; 1327 unsigned long flags; 1328 struct bucket *b; 1329 u32 key_size, hash; 1330 int ret; 1331 1332 if (unlikely(map_flags > BPF_EXIST)) 1333 /* unknown flags */ 1334 return -EINVAL; 1335 1336 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 1337 !rcu_read_lock_bh_held()); 1338 1339 key_size = map->key_size; 1340 1341 hash = htab_map_hash(key, key_size, htab->hashrnd); 1342 1343 b = __select_bucket(htab, hash); 1344 head = &b->head; 1345 1346 /* For LRU, we need to alloc before taking bucket's 1347 * spinlock because LRU's elem alloc may need 1348 * to remove older elem from htab and this removal 1349 * operation will need a bucket lock. 
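	 *
	 * Note that for BPF_EXIST the pop is skipped below: updating an
	 * existing element only rewrites its per-CPU value area in place,
	 * so no new element (and thus no potential eviction) is needed.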
1350 */ 1351 if (map_flags != BPF_EXIST) { 1352 l_new = prealloc_lru_pop(htab, key, hash); 1353 if (!l_new) 1354 return -ENOMEM; 1355 } 1356 1357 ret = htab_lock_bucket(htab, b, hash, &flags); 1358 if (ret) 1359 goto err_lock_bucket; 1360 1361 l_old = lookup_elem_raw(head, hash, key, key_size); 1362 1363 ret = check_flags(htab, l_old, map_flags); 1364 if (ret) 1365 goto err; 1366 1367 if (l_old) { 1368 bpf_lru_node_set_ref(&l_old->lru_node); 1369 1370 /* per-cpu hash map can update value in-place */ 1371 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), 1372 value, onallcpus); 1373 } else { 1374 pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), 1375 value, onallcpus); 1376 hlist_nulls_add_head_rcu(&l_new->hash_node, head); 1377 l_new = NULL; 1378 } 1379 ret = 0; 1380 err: 1381 htab_unlock_bucket(htab, b, hash, flags); 1382 err_lock_bucket: 1383 if (l_new) { 1384 bpf_map_dec_elem_count(&htab->map); 1385 bpf_lru_push_free(&htab->lru, &l_new->lru_node); 1386 } 1387 return ret; 1388 } 1389 1390 static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, 1391 void *value, u64 map_flags) 1392 { 1393 return __htab_percpu_map_update_elem(map, key, value, map_flags, false); 1394 } 1395 1396 static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, 1397 void *value, u64 map_flags) 1398 { 1399 return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, 1400 false); 1401 } 1402 1403 /* Called from syscall or from eBPF program */ 1404 static long htab_map_delete_elem(struct bpf_map *map, void *key) 1405 { 1406 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1407 struct hlist_nulls_head *head; 1408 struct bucket *b; 1409 struct htab_elem *l; 1410 unsigned long flags; 1411 u32 hash, key_size; 1412 int ret; 1413 1414 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 1415 !rcu_read_lock_bh_held()); 1416 1417 key_size = map->key_size; 1418 1419 hash = htab_map_hash(key, key_size, htab->hashrnd); 1420 b = __select_bucket(htab, hash); 1421 head = &b->head; 1422 1423 ret = htab_lock_bucket(htab, b, hash, &flags); 1424 if (ret) 1425 return ret; 1426 1427 l = lookup_elem_raw(head, hash, key, key_size); 1428 1429 if (l) { 1430 hlist_nulls_del_rcu(&l->hash_node); 1431 free_htab_elem(htab, l); 1432 } else { 1433 ret = -ENOENT; 1434 } 1435 1436 htab_unlock_bucket(htab, b, hash, flags); 1437 return ret; 1438 } 1439 1440 static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) 1441 { 1442 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1443 struct hlist_nulls_head *head; 1444 struct bucket *b; 1445 struct htab_elem *l; 1446 unsigned long flags; 1447 u32 hash, key_size; 1448 int ret; 1449 1450 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && 1451 !rcu_read_lock_bh_held()); 1452 1453 key_size = map->key_size; 1454 1455 hash = htab_map_hash(key, key_size, htab->hashrnd); 1456 b = __select_bucket(htab, hash); 1457 head = &b->head; 1458 1459 ret = htab_lock_bucket(htab, b, hash, &flags); 1460 if (ret) 1461 return ret; 1462 1463 l = lookup_elem_raw(head, hash, key, key_size); 1464 1465 if (l) 1466 hlist_nulls_del_rcu(&l->hash_node); 1467 else 1468 ret = -ENOENT; 1469 1470 htab_unlock_bucket(htab, b, hash, flags); 1471 if (l) 1472 htab_lru_push_free(htab, l); 1473 return ret; 1474 } 1475 1476 static void delete_all_elements(struct bpf_htab *htab) 1477 { 1478 int i; 1479 1480 /* It's called from a worker thread, so disable migration here, 1481 * since bpf_mem_cache_free() relies on that. 
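	 *
	 * (bpf_mem_cache_free() operates on per-CPU caches via this_cpu
	 * operations, so the CPU must not change while an element is
	 * being returned to the cache.)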
	 */
	migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			htab_elem_free(htab, l);
		}
	}
	migrate_enable();
}

static void htab_free_malloced_timers(struct bpf_htab *htab)
{
	int i;

	rcu_read_lock();
	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry(l, n, head, hash_node) {
			/* We only free the timer on uref dropping to zero */
			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();
}

static void htab_map_free_timers(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* We only free the timer on uref dropping to zero */
	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
		return;
	if (!htab_is_prealloc(htab))
		htab_free_malloced_timers(htab);
	else
		htab_free_prealloced_timers(htab);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int i;

	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
	 * bpf_free_used_maps() is called after bpf prog is no longer executing.
	 * There is no need to synchronize_rcu() here to protect map elements.
	 */

	/* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
	 * underneath and is responsible for waiting for callbacks to finish
	 * during bpf_mem_alloc_destroy().
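	 *
	 * Hence the teardown order below: drop all remaining elements (or
	 * the preallocated area) first, and only then destroy the
	 * bpf_mem_alloc caches and the rest of the map.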
1543 */ 1544 if (!htab_is_prealloc(htab)) { 1545 delete_all_elements(htab); 1546 } else { 1547 htab_free_prealloced_fields(htab); 1548 prealloc_destroy(htab); 1549 } 1550 1551 bpf_map_free_elem_count(map); 1552 free_percpu(htab->extra_elems); 1553 bpf_map_area_free(htab->buckets); 1554 bpf_mem_alloc_destroy(&htab->pcpu_ma); 1555 bpf_mem_alloc_destroy(&htab->ma); 1556 if (htab->use_percpu_counter) 1557 percpu_counter_destroy(&htab->pcount); 1558 for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) 1559 free_percpu(htab->map_locked[i]); 1560 lockdep_unregister_key(&htab->lockdep_key); 1561 bpf_map_area_free(htab); 1562 } 1563 1564 static void htab_map_seq_show_elem(struct bpf_map *map, void *key, 1565 struct seq_file *m) 1566 { 1567 void *value; 1568 1569 rcu_read_lock(); 1570 1571 value = htab_map_lookup_elem(map, key); 1572 if (!value) { 1573 rcu_read_unlock(); 1574 return; 1575 } 1576 1577 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 1578 seq_puts(m, ": "); 1579 btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); 1580 seq_puts(m, "\n"); 1581 1582 rcu_read_unlock(); 1583 } 1584 1585 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, 1586 void *value, bool is_lru_map, 1587 bool is_percpu, u64 flags) 1588 { 1589 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1590 struct hlist_nulls_head *head; 1591 unsigned long bflags; 1592 struct htab_elem *l; 1593 u32 hash, key_size; 1594 struct bucket *b; 1595 int ret; 1596 1597 key_size = map->key_size; 1598 1599 hash = htab_map_hash(key, key_size, htab->hashrnd); 1600 b = __select_bucket(htab, hash); 1601 head = &b->head; 1602 1603 ret = htab_lock_bucket(htab, b, hash, &bflags); 1604 if (ret) 1605 return ret; 1606 1607 l = lookup_elem_raw(head, hash, key, key_size); 1608 if (!l) { 1609 ret = -ENOENT; 1610 } else { 1611 if (is_percpu) { 1612 u32 roundup_value_size = round_up(map->value_size, 8); 1613 void __percpu *pptr; 1614 int off = 0, cpu; 1615 1616 pptr = htab_elem_get_ptr(l, key_size); 1617 for_each_possible_cpu(cpu) { 1618 copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); 1619 check_and_init_map_value(&htab->map, value + off); 1620 off += roundup_value_size; 1621 } 1622 } else { 1623 u32 roundup_key_size = round_up(map->key_size, 8); 1624 1625 if (flags & BPF_F_LOCK) 1626 copy_map_value_locked(map, value, l->key + 1627 roundup_key_size, 1628 true); 1629 else 1630 copy_map_value(map, value, l->key + 1631 roundup_key_size); 1632 /* Zeroing special fields in the temp buffer */ 1633 check_and_init_map_value(map, value); 1634 } 1635 1636 hlist_nulls_del_rcu(&l->hash_node); 1637 if (!is_lru_map) 1638 free_htab_elem(htab, l); 1639 } 1640 1641 htab_unlock_bucket(htab, b, hash, bflags); 1642 1643 if (is_lru_map && l) 1644 htab_lru_push_free(htab, l); 1645 1646 return ret; 1647 } 1648 1649 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, 1650 void *value, u64 flags) 1651 { 1652 return __htab_map_lookup_and_delete_elem(map, key, value, false, false, 1653 flags); 1654 } 1655 1656 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, 1657 void *key, void *value, 1658 u64 flags) 1659 { 1660 return __htab_map_lookup_and_delete_elem(map, key, value, false, true, 1661 flags); 1662 } 1663 1664 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, 1665 void *value, u64 flags) 1666 { 1667 return __htab_map_lookup_and_delete_elem(map, key, value, true, false, 1668 flags); 1669 } 1670 1671 static int 
htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, 1672 void *key, void *value, 1673 u64 flags) 1674 { 1675 return __htab_map_lookup_and_delete_elem(map, key, value, true, true, 1676 flags); 1677 } 1678 1679 static int 1680 __htab_map_lookup_and_delete_batch(struct bpf_map *map, 1681 const union bpf_attr *attr, 1682 union bpf_attr __user *uattr, 1683 bool do_delete, bool is_lru_map, 1684 bool is_percpu) 1685 { 1686 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1687 u32 bucket_cnt, total, key_size, value_size, roundup_key_size; 1688 void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; 1689 void __user *uvalues = u64_to_user_ptr(attr->batch.values); 1690 void __user *ukeys = u64_to_user_ptr(attr->batch.keys); 1691 void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); 1692 u32 batch, max_count, size, bucket_size, map_id; 1693 struct htab_elem *node_to_free = NULL; 1694 u64 elem_map_flags, map_flags; 1695 struct hlist_nulls_head *head; 1696 struct hlist_nulls_node *n; 1697 unsigned long flags = 0; 1698 bool locked = false; 1699 struct htab_elem *l; 1700 struct bucket *b; 1701 int ret = 0; 1702 1703 elem_map_flags = attr->batch.elem_flags; 1704 if ((elem_map_flags & ~BPF_F_LOCK) || 1705 ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) 1706 return -EINVAL; 1707 1708 map_flags = attr->batch.flags; 1709 if (map_flags) 1710 return -EINVAL; 1711 1712 max_count = attr->batch.count; 1713 if (!max_count) 1714 return 0; 1715 1716 if (put_user(0, &uattr->batch.count)) 1717 return -EFAULT; 1718 1719 batch = 0; 1720 if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch))) 1721 return -EFAULT; 1722 1723 if (batch >= htab->n_buckets) 1724 return -ENOENT; 1725 1726 key_size = htab->map.key_size; 1727 roundup_key_size = round_up(htab->map.key_size, 8); 1728 value_size = htab->map.value_size; 1729 size = round_up(value_size, 8); 1730 if (is_percpu) 1731 value_size = size * num_possible_cpus(); 1732 total = 0; 1733 /* while experimenting with hash tables with sizes ranging from 10 to 1734 * 1000, it was observed that a bucket can have up to 5 entries. 1735 */ 1736 bucket_size = 5; 1737 1738 alloc: 1739 /* We cannot do copy_from_user or copy_to_user inside 1740 * the rcu_read_lock. Allocate enough space here. 1741 */ 1742 keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); 1743 values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); 1744 if (!keys || !values) { 1745 ret = -ENOMEM; 1746 goto after_loop; 1747 } 1748 1749 again: 1750 bpf_disable_instrumentation(); 1751 rcu_read_lock(); 1752 again_nocopy: 1753 dst_key = keys; 1754 dst_val = values; 1755 b = &htab->buckets[batch]; 1756 head = &b->head; 1757 /* do not grab the lock unless need it (bucket_cnt > 0). */ 1758 if (locked) { 1759 ret = htab_lock_bucket(htab, b, batch, &flags); 1760 if (ret) { 1761 rcu_read_unlock(); 1762 bpf_enable_instrumentation(); 1763 goto after_loop; 1764 } 1765 } 1766 1767 bucket_cnt = 0; 1768 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) 1769 bucket_cnt++; 1770 1771 if (bucket_cnt && !locked) { 1772 locked = true; 1773 goto again_nocopy; 1774 } 1775 1776 if (bucket_cnt > (max_count - total)) { 1777 if (total == 0) 1778 ret = -ENOSPC; 1779 /* Note that since bucket_cnt > 0 here, it is implicit 1780 * that the locked was grabbed, so release it. 
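			 *
			 * If nothing has been copied yet (total == 0),
			 * -ENOSPC tells user space that not even one full
			 * bucket fits into batch.count, so it should retry
			 * with a larger buffer.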
1781 */ 1782 htab_unlock_bucket(htab, b, batch, flags); 1783 rcu_read_unlock(); 1784 bpf_enable_instrumentation(); 1785 goto after_loop; 1786 } 1787 1788 if (bucket_cnt > bucket_size) { 1789 bucket_size = bucket_cnt; 1790 /* Note that since bucket_cnt > 0 here, it is implicit 1791 * that the locked was grabbed, so release it. 1792 */ 1793 htab_unlock_bucket(htab, b, batch, flags); 1794 rcu_read_unlock(); 1795 bpf_enable_instrumentation(); 1796 kvfree(keys); 1797 kvfree(values); 1798 goto alloc; 1799 } 1800 1801 /* Next block is only safe to run if you have grabbed the lock */ 1802 if (!locked) 1803 goto next_batch; 1804 1805 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { 1806 memcpy(dst_key, l->key, key_size); 1807 1808 if (is_percpu) { 1809 int off = 0, cpu; 1810 void __percpu *pptr; 1811 1812 pptr = htab_elem_get_ptr(l, map->key_size); 1813 for_each_possible_cpu(cpu) { 1814 copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); 1815 check_and_init_map_value(&htab->map, dst_val + off); 1816 off += size; 1817 } 1818 } else { 1819 value = l->key + roundup_key_size; 1820 if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { 1821 struct bpf_map **inner_map = value; 1822 1823 /* Actual value is the id of the inner map */ 1824 map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); 1825 value = &map_id; 1826 } 1827 1828 if (elem_map_flags & BPF_F_LOCK) 1829 copy_map_value_locked(map, dst_val, value, 1830 true); 1831 else 1832 copy_map_value(map, dst_val, value); 1833 /* Zeroing special fields in the temp buffer */ 1834 check_and_init_map_value(map, dst_val); 1835 } 1836 if (do_delete) { 1837 hlist_nulls_del_rcu(&l->hash_node); 1838 1839 /* bpf_lru_push_free() will acquire lru_lock, which 1840 * may cause deadlock. See comments in function 1841 * prealloc_lru_pop(). Let us do bpf_lru_push_free() 1842 * after releasing the bucket lock. 1843 */ 1844 if (is_lru_map) { 1845 l->batch_flink = node_to_free; 1846 node_to_free = l; 1847 } else { 1848 free_htab_elem(htab, l); 1849 } 1850 } 1851 dst_key += key_size; 1852 dst_val += value_size; 1853 } 1854 1855 htab_unlock_bucket(htab, b, batch, flags); 1856 locked = false; 1857 1858 while (node_to_free) { 1859 l = node_to_free; 1860 node_to_free = node_to_free->batch_flink; 1861 htab_lru_push_free(htab, l); 1862 } 1863 1864 next_batch: 1865 /* If we are not copying data, we can go to next bucket and avoid 1866 * unlocking the rcu. 
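	 *
	 * i.e. empty buckets are skipped while still holding
	 * rcu_read_lock() and with instrumentation disabled; both are only
	 * dropped once there is data to copy out to user space.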
1867 */ 1868 if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { 1869 batch++; 1870 goto again_nocopy; 1871 } 1872 1873 rcu_read_unlock(); 1874 bpf_enable_instrumentation(); 1875 if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, 1876 key_size * bucket_cnt) || 1877 copy_to_user(uvalues + total * value_size, values, 1878 value_size * bucket_cnt))) { 1879 ret = -EFAULT; 1880 goto after_loop; 1881 } 1882 1883 total += bucket_cnt; 1884 batch++; 1885 if (batch >= htab->n_buckets) { 1886 ret = -ENOENT; 1887 goto after_loop; 1888 } 1889 goto again; 1890 1891 after_loop: 1892 if (ret == -EFAULT) 1893 goto out; 1894 1895 /* copy # of entries and next batch */ 1896 ubatch = u64_to_user_ptr(attr->batch.out_batch); 1897 if (copy_to_user(ubatch, &batch, sizeof(batch)) || 1898 put_user(total, &uattr->batch.count)) 1899 ret = -EFAULT; 1900 1901 out: 1902 kvfree(keys); 1903 kvfree(values); 1904 return ret; 1905 } 1906 1907 static int 1908 htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1909 union bpf_attr __user *uattr) 1910 { 1911 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1912 false, true); 1913 } 1914 1915 static int 1916 htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, 1917 const union bpf_attr *attr, 1918 union bpf_attr __user *uattr) 1919 { 1920 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1921 false, true); 1922 } 1923 1924 static int 1925 htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1926 union bpf_attr __user *uattr) 1927 { 1928 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1929 false, false); 1930 } 1931 1932 static int 1933 htab_map_lookup_and_delete_batch(struct bpf_map *map, 1934 const union bpf_attr *attr, 1935 union bpf_attr __user *uattr) 1936 { 1937 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1938 false, false); 1939 } 1940 1941 static int 1942 htab_lru_percpu_map_lookup_batch(struct bpf_map *map, 1943 const union bpf_attr *attr, 1944 union bpf_attr __user *uattr) 1945 { 1946 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1947 true, true); 1948 } 1949 1950 static int 1951 htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, 1952 const union bpf_attr *attr, 1953 union bpf_attr __user *uattr) 1954 { 1955 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1956 true, true); 1957 } 1958 1959 static int 1960 htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, 1961 union bpf_attr __user *uattr) 1962 { 1963 return __htab_map_lookup_and_delete_batch(map, attr, uattr, false, 1964 true, false); 1965 } 1966 1967 static int 1968 htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, 1969 const union bpf_attr *attr, 1970 union bpf_attr __user *uattr) 1971 { 1972 return __htab_map_lookup_and_delete_batch(map, attr, uattr, true, 1973 true, false); 1974 } 1975 1976 struct bpf_iter_seq_hash_map_info { 1977 struct bpf_map *map; 1978 struct bpf_htab *htab; 1979 void *percpu_value_buf; // non-zero means percpu hash 1980 u32 bucket_id; 1981 u32 skip_elems; 1982 }; 1983 1984 static struct htab_elem * 1985 bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, 1986 struct htab_elem *prev_elem) 1987 { 1988 const struct bpf_htab *htab = info->htab; 1989 u32 skip_elems = info->skip_elems; 1990 u32 bucket_id = info->bucket_id; 1991 struct hlist_nulls_head *head; 1992 struct hlist_nulls_node *n; 1993 struct htab_elem *elem; 1994 struct bucket *b; 1995 u32 i, count; 
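
	/* The iterator remembers its position as (bucket_id, skip_elems):
	 * when it is (re)started without a previous element, it re-walks
	 * bucket bucket_id and skips the elements that were already shown.
	 */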
struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; /* non-zero means percpu hash */
	u32 bucket_id;
	u32 skip_elems;
};

static struct htab_elem *
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
			   struct htab_elem *prev_elem)
{
	const struct bpf_htab *htab = info->htab;
	u32 skip_elems = info->skip_elems;
	u32 bucket_id = info->bucket_id;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	struct bucket *b;
	u32 i, count;

	if (bucket_id >= htab->n_buckets)
		return NULL;

	/* try to find next elem in the same bucket */
	if (prev_elem) {
		/* no update/deletion on this bucket, prev_elem should still be
		 * valid and we won't skip elements.
		 */
		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
		if (elem)
			return elem;

		/* not found, unlock and go to the next bucket */
		b = &htab->buckets[bucket_id++];
		rcu_read_unlock();
		skip_elems = 0;
	}

	for (i = bucket_id; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();

		count = 0;
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			if (count >= skip_elems) {
				info->bucket_id = i;
				info->skip_elems = count;
				return elem;
			}
			count++;
		}

		rcu_read_unlock();
		skip_elems = 0;
	}

	info->bucket_id = i;
	info->skip_elems = 0;
	return NULL;
}

static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct htab_elem *elem;

	elem = bpf_hash_map_seq_find_next(info, NULL);
	if (!elem)
		return NULL;

	if (*pos == 0)
		++*pos;
	return elem;
}

static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;

	++*pos;
	++info->skip_elems;
	return bpf_hash_map_seq_find_next(info, v);
}

static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	u32 roundup_key_size, roundup_value_size;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	int ret = 0, off = 0, cpu;
	struct bpf_prog *prog;
	void __percpu *pptr;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, elem == NULL);
	if (prog) {
		ctx.meta = &meta;
		ctx.map = info->map;
		if (elem) {
			roundup_key_size = round_up(map->key_size, 8);
			ctx.key = elem->key;
			if (!info->percpu_value_buf) {
				ctx.value = elem->key + roundup_key_size;
			} else {
				roundup_value_size = round_up(map->value_size, 8);
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
					off += roundup_value_size;
				}
				ctx.value = info->percpu_value_buf;
			}
		}
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}

static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_hash_map_seq_show(seq, v);
}

static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_hash_map_seq_show(seq, NULL);
	else
		rcu_read_unlock();
}

static int bpf_iter_init_hash_map(void *priv_data,
				  struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	bpf_map_inc_with_uref(map);
	seq_info->map = map;
	seq_info->htab = container_of(map, struct bpf_htab, map);
	return 0;
}

static void bpf_iter_fini_hash_map(void *priv_data)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;

	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_hash_map_seq_ops = {
	.start = bpf_hash_map_seq_start,
	.next = bpf_hash_map_seq_next,
	.stop = bpf_hash_map_seq_stop,
	.show = bpf_hash_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_hash_map_seq_ops,
	.init_seq_private = bpf_iter_init_hash_map,
	.fini_seq_private = bpf_iter_fini_hash_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
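
/*
 * Illustrative sketch (not compiled here): iter_seq_info above backs the
 * "bpf_map_elem" iterator, so a BPF program of the following shape can walk a
 * hash map's elements and print them through the seq_file; reading the pinned
 * iterator link then drives the seq_ops defined above. The u32 key / u64
 * value layout is an assumption for the example; BPF_SEQ_PRINTF comes from
 * libbpf's bpf_tracing.h.
 *
 *	SEC("iter/bpf_map_elem")
 *	int dump_hash_elem(struct bpf_iter__bpf_map_elem *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		__u32 *key = ctx->key;
 *		__u64 *val = ctx->value;
 *
 *		// key/value are NULL on the final call (elem == NULL above)
 *		if (!key || !val)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
 *		return 0;
 *	}
 */
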
static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
				   void *callback_ctx, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	u32 roundup_key_size;
	int i, num_elems = 0;
	void __percpu *pptr;
	struct bucket *b;
	void *key, *val;
	bool is_percpu;
	u64 ret = 0;

	if (flags != 0)
		return -EINVAL;

	is_percpu = htab_is_percpu(htab);

	roundup_key_size = round_up(map->key_size, 8);
	/* disable migration so percpu value prepared here will be the
	 * same as the one seen by the bpf program with bpf_map_lookup_elem().
	 */
	if (is_percpu)
		migrate_disable();
	for (i = 0; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			key = elem->key;
			if (is_percpu) {
				/* current cpu value for percpu map */
				pptr = htab_elem_get_ptr(elem, map->key_size);
				val = this_cpu_ptr(pptr);
			} else {
				val = elem->key + roundup_key_size;
			}
			num_elems++;
			ret = callback_fn((u64)(long)map, (u64)(long)key,
					  (u64)(long)val, (u64)(long)callback_ctx, 0);
			/* return value: 0 - continue, 1 - stop and return */
			if (ret) {
				rcu_read_unlock();
				goto out;
			}
		}
		rcu_read_unlock();
	}
out:
	if (is_percpu)
		migrate_enable();
	return num_elems;
}
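
/*
 * Illustrative sketch (not compiled here): bpf_for_each_hash_elem() above is
 * what runs when a BPF program calls the bpf_for_each_map_elem() helper on a
 * hash map. The callback receives (map, key, value, ctx) and follows the
 * "return 0 to continue, 1 to stop" convention checked above. The map name,
 * key/value types and callback_ctx layout are assumptions for the example.
 *
 *	struct callback_ctx {
 *		__u64 sum;
 *	};
 *
 *	static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
 *			      struct callback_ctx *data)
 *	{
 *		data->sum += *val;
 *		return 0;	// continue iterating
 *	}
 *
 *	// ... somewhere in a BPF program:
 *	struct callback_ctx data = {};
 *
 *	bpf_for_each_map_elem(&hash_map, sum_elem, &data, 0);
 *	// the helper's return value is the number of elements visited
 */
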
static u64 htab_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 value_size = round_up(htab->map.value_size, 8);
	bool prealloc = htab_is_prealloc(htab);
	bool percpu = htab_is_percpu(htab);
	bool lru = htab_is_lru(htab);
	u64 num_entries;
	u64 usage = sizeof(struct bpf_htab);

	usage += sizeof(struct bucket) * htab->n_buckets;
	usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
	if (prealloc) {
		num_entries = map->max_entries;
		if (htab_has_extra_elems(htab))
			num_entries += num_possible_cpus();

		usage += htab->elem_size * num_entries;

		if (percpu)
			usage += value_size * num_possible_cpus() * num_entries;
		else if (!lru)
			usage += sizeof(struct htab_elem *) * num_possible_cpus();
	} else {
#define LLIST_NODE_SZ sizeof(struct llist_node)

		num_entries = htab->use_percpu_counter ?
				      percpu_counter_sum(&htab->pcount) :
				      atomic_read(&htab->count);
		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
		if (percpu) {
			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
			usage += value_size * num_possible_cpus() * num_entries;
		}
	}
	return usage;
}

BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_timers,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_timers,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l)
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	}

	return NULL;
}
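
/*
 * Illustrative sketch (not compiled here): the *_lookup_percpu_elem()
 * callbacks above implement the bpf_map_lookup_percpu_elem() helper, which
 * lets a BPF program read the value slot of a specific CPU instead of the
 * current one. The map name, key/value types and target_cpu are assumptions
 * for the example; a cpu >= nr_cpu_ids simply yields NULL, as coded above.
 *
 *	__u32 key = 0;
 *	__u64 total = 0;
 *	__u64 *val;
 *
 *	val = bpf_map_lookup_percpu_elem(&percpu_hash, &key, target_cpu);
 *	if (val)
 *		total += *val;
 */
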
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};
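
/*
 * Illustrative sketch (not compiled here): bpf_percpu_hash_copy() above is
 * what services a user-space lookup on a per-CPU hash map, so the caller must
 * supply one round_up(value_size, 8) slot per possible CPU. A minimal libbpf
 * sketch, assuming map_fd refers to a per-CPU hash with u32 keys and u64
 * values:
 *
 *	#include <stdlib.h>
 *	#include <bpf/bpf.h>
 *	#include <bpf/libbpf.h>
 *
 *	int ncpus = libbpf_num_possible_cpus();
 *	__u64 *vals = calloc(ncpus, sizeof(__u64));	// u64 is already 8-byte aligned
 *	__u32 key = 0;
 *	__u64 sum = 0;
 *
 *	if (!bpf_map_lookup_elem(map_fd, &key, vals)) {
 *		for (int cpu = 0; cpu < ncpus; cpu++)
 *			sum += vals[cpu];
 *	}
 *	free(vals);
 */
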
const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(map, ptr, false);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}
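
/*
 * Illustrative sketch (not compiled here): from user space, a lookup on a
 * hash-of-maps returns the inner map's id (see bpf_fd_htab_map_lookup_elem()
 * above), while an update takes an inner map fd (see
 * bpf_fd_htab_map_update_elem() below). A minimal libbpf sketch; outer_fd,
 * inner_fd and the u32 key are assumptions for the example.
 *
 *	#include <bpf/bpf.h>
 *
 *	__u32 key = 1, inner_id;
 *
 *	// insert: the value passed in is the fd of an existing inner map
 *	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
 *
 *	// lookup: the value comes back as the inner map id, not an fd
 *	if (!bpf_map_lookup_elem(outer_fd, &key, &inner_id)) {
 *		int fd = bpf_map_get_fd_by_id(inner_id);
 *		// ... use fd, then close(fd)
 *	}
 */
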
/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;
	u32 ufd = *(u32 *)value;

	ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* The htab bucket lock is always held during update operations in fd
	 * htab map, and the following rcu_read_lock() is only used to avoid
	 * the WARN_ON_ONCE in htab_map_update_elem().
	 */
	rcu_read_lock();
	ret = htab_map_update_elem(map, key, &ptr, map_flags);
	rcu_read_unlock();
	if (ret)
		map->ops->map_fd_put_ptr(map, ptr, false);

	return ret;
}

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
		     (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
};
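
/*
 * Illustrative sketch (not compiled here): on the BPF program side,
 * htab_of_map_gen_lookup()/htab_of_map_lookup_elem() above make a lookup on a
 * BPF_MAP_TYPE_HASH_OF_MAPS return a pointer to the inner map, which can then
 * be used for a second lookup. Map and key names are assumptions for the
 * example.
 *
 *	__u32 outer_key = 0, inner_key = 0;
 *	void *inner_map;
 *	__u64 *val;
 *
 *	inner_map = bpf_map_lookup_elem(&outer_hash, &outer_key);
 *	if (inner_map) {
 *		val = bpf_map_lookup_elem(inner_map, &inner_key);
 *		if (val)
 *			(*val)++;
 *	}
 */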