/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"

struct bucket {
	struct hlist_nulls_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	void __percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

enum extra_elem_state {
	HTAB_NOT_AN_EXTRA_ELEM = 0,
	HTAB_EXTRA_ELEM_FREE,
	HTAB_EXTRA_ELEM_USED
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_nulls_node hash_node;
		struct {
			void *padding;
			union {
				struct bpf_htab *htab;
				struct pcpu_freelist_node fnode;
			};
		};
	};
	union {
		struct rcu_head rcu;
		enum extra_elem_state state;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	int err = -ENOMEM, i;

	htab->elems = bpf_map_area_alloc(htab->elem_size *
					 htab->map.max_entries);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, htab->map.max_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, htab->map.max_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	void __percpu *pptr;
	int cpu;

	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
			HTAB_EXTRA_ELEM_FREE;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than other
		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
		 */
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
		/* reserved bits should not be used */
		return ERR_PTR(-EINVAL);

	if (!lru && percpu_lru)
		return ERR_PTR(-EINVAL);

	if (lru && !prealloc)
		return ERR_PTR(-ENOTSUPP);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	htab->map.map_flags = attr->map_flags;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	if (percpu_lru) {
		/* ensure each CPU's lru list has >=1 elements.
		 * while we are at it, make each lru list have the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(htab->map.pages);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket));
	if (!htab->buckets)
		goto free_htab;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (!percpu && !lru) {
		/* lru itself can remove the least used element, so
		 * there is no need for an extra elem during map_update.
		 */
		err = alloc_extra_elems(htab);
		if (err)
			goto free_buckets;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_extra_elems;
	}

	return &htab->map;

free_extra_elems:
	free_percpu(htab->extra_elems);
free_buckets:
	bpf_map_area_free(htab->buckets);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

/* this lookup function can only be called with bucket lock taken */
static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* can be called without bucket lock. it will repeat the loop in
 * the unlikely event that elements moved from one bucket into another
 * while the linked list is being walked
 */
static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
					       u32 hash, void *key,
					       u32 key_size, u32 n_buckets)
{
	struct hlist_nulls_node *n;
	struct htab_elem *l;

again:
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
		goto again;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
					struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
						struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	if (l->state == HTAB_EXTRA_ELEM_USED) {
		l->state = HTAB_EXTRA_ELEM_FREE;
		return;
	}

	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 bool old_elem_exists)
{
	u32 size = htab->map.value_size;
	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
	struct htab_elem *l_new;
	void __percpu *pptr;
	int err = 0;

	if (prealloc) {
		struct pcpu_freelist_node *l;

		l = pcpu_freelist_pop(&htab->freelist);
		if (!l)
			err = -E2BIG;
		else
			l_new = container_of(l, struct htab_elem, fnode);
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
			atomic_dec(&htab->count);
			err = -E2BIG;
		} else {
			l_new = kmalloc(htab->elem_size,
					GFP_ATOMIC | __GFP_NOWARN);
			if (!l_new)
				return ERR_PTR(-ENOMEM);
		}
	}

	if (err) {
		if (!old_elem_exists)
			return ERR_PTR(err);

		/* if we're updating the existing element and the hash table
		 * is full, use per-cpu extra elems
		 */
		l_new = this_cpu_ptr(htab->extra_elems);
		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
			return ERR_PTR(-E2BIG);
		l_new->state = HTAB_EXTRA_ELEM_USED;
	} else {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				return ERR_PTR(-ENOMEM);
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	l_new = alloc_htab_elem(htab, key, value,
				key_size, hash, false, false, !!l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_nulls_del_rcu(&l_old->hash_node);
		free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_nulls_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, false);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
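		/* add new element to the head of the list, so that
		 * concurrent search will find it
		 */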
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_nulls_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_nulls_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_nulls_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_nulls_head *head = select_bucket(htab, i);
		struct hlist_nulls_node *n;
		struct htab_elem *l;

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			hlist_nulls_del_rcu(&l->hash_node);
			if (l->state != HTAB_EXTRA_ELEM_USED)
				htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __ro_after_init = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static const struct bpf_map_ops htab_lru_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_type __ro_after_init = {
	.ops = &htab_lru_ops,
	.type = BPF_MAP_TYPE_LRU_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	if (htab_is_lru(htab))
		bpf_lru_node_set_ref(&l->lru_node);
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static const struct bpf_map_ops htab_lru_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
	.ops = &htab_lru_percpu_ops,
	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	bpf_register_map_type(&htab_lru_type);
	bpf_register_map_type(&htab_lru_percpu_type);
	return 0;
}
late_initcall(register_htab_map);