// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
int array_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return -EINVAL;

	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
	    attr->map_flags & BPF_F_MMAPABLE)
		return -EINVAL;

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return -E2BIG;

	return 0;
}
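
/* For illustration only (not compiled here): a user-space BPF_MAP_CREATE
 * request that passes the checks above must use a 4-byte key and non-zero
 * value_size/max_entries. A minimal sketch using the raw bpf(2) syscall,
 * with arbitrary example sizes, would be:
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_ARRAY,
 *		.key_size    = 4,
 *		.value_size  = 64,
 *		.max_entries = 256,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */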

static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int ret, numa_node = bpf_map_attr_numa_node(attr);
	u32 elem_size, index_mask, max_entries;
	bool bypass_spec_v1 = bpf_bypass_spec_v1();
	u64 cost, array_size, mask64;
	struct bpf_map_memory mem;
	struct bpf_array *array;

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (!bypass_spec_v1) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu) {
		array_size += (u64) max_entries * sizeof(void *);
	} else {
		/* rely on vmalloc() to return page-aligned memory and
		 * ensure array->value is exactly page-aligned
		 */
		if (attr->map_flags & BPF_F_MMAPABLE) {
			array_size = PAGE_ALIGN(array_size);
			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
		} else {
			array_size += (u64) max_entries * elem_size;
		}
	}

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (percpu)
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();

	ret = bpf_map_charge_init(&mem, cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	if (attr->map_flags & BPF_F_MMAPABLE) {
		void *data;

		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
		if (!data) {
			bpf_map_charge_finish(&mem);
			return ERR_PTR(-ENOMEM);
		}
		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
			- offsetof(struct bpf_array, value);
	} else {
		array = bpf_map_area_alloc(array_size, numa_node);
	}
	if (!array) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}
	array->index_mask = index_mask;
	array->map.bypass_spec_v1 = bypass_spec_v1;

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	bpf_map_charge_move(&array->map.memory, &mem);
	array->elem_size = elem_size;

	if (percpu && bpf_array_alloc_percpu(array)) {
		bpf_map_charge_finish(&array->map.memory);
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}
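
/* Note on the index_mask AND above: even though the index was already
 * checked against max_entries, the CPU may speculate past that branch
 * (Spectre v1). Because array_map_alloc() rounded the allocation up to
 * index_mask + 1 slots when mitigations are on, masking the index keeps
 * even a mis-speculated access inside the allocation. For example, with
 * max_entries == 5 the mask is 7 and the array is backed by 8 slots.
 */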

static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
				       u32 off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (off >= map->value_size)
		return -EINVAL;

	*imm = (unsigned long)array->value;
	return 0;
}

static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
				       u32 *off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u64 base = (unsigned long)array->value;
	u64 range = array->elem_size;

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (imm < base || imm >= base + range)
		return -ENOENT;

	*off = imm - base;
	return 0;
}

/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}
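
/* Sizing note for callers: the flat buffer filled above is laid out as one
 * round_up(value_size, 8) chunk per possible CPU, in for_each_possible_cpu()
 * order, so user space is expected to provide at least
 * round_up(value_size, 8) * num_possible_cpus() bytes for a per-cpu lookup.
 */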

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}
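
/* Illustrative user-space iteration built on the convention above (a missing
 * or out-of-range key restarts at index 0, the last index returns -ENOENT).
 * A rough sketch using libbpf wrappers, error handling elided:
 *
 *	__u32 key, next_key;
 *	int err = bpf_map_get_next_key(map_fd, NULL, &next_key);
 *	while (!err) {
 *		key = next_key;
 *		// ... bpf_map_lookup_elem(map_fd, &key, value) ...
 *		err = bpf_map_get_next_key(map_fd, &key, &next_key);
 *	}
 */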

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	char *val;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags & BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (unlikely((map_flags & BPF_F_LOCK) &&
		     !map_value_has_spin_lock(map)))
		return -EINVAL;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	} else {
		val = array->value +
			array->elem_size * (index & array->index_mask);
		if (map_flags & BPF_F_LOCK)
			copy_map_value_locked(map, val, value, false);
		else
			copy_map_value(map, val, value);
	}
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into the per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned, or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks are possible.
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}
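
/* For BPF_F_MMAPABLE arrays, array_map_alloc() offsets the pointer it
 * returns inside one vmalloc'ed region so that array->value starts on a
 * page boundary, with the struct bpf_array header occupying the tail of the
 * preceding page. Rounding the array pointer down to a page boundary
 * therefore recovers the start of the vmalloc allocation (this relies on
 * the header fitting within that single leading page, as the allocation
 * above arranges).
 */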
static void *array_map_vmalloc_addr(struct bpf_array *array)
{
	return (void *)round_down((unsigned long)array, PAGE_SIZE);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	if (array->map.map_flags & BPF_F_MMAPABLE)
		bpf_map_area_free(array_map_vmalloc_addr(array));
	else
		bpf_map_area_free(array);
}

static void array_map_seq_show_elem(struct bpf_map *map, void *key,
				    struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = array_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	if (map->btf_key_type_id)
		seq_printf(m, "%u: ", *(u32 *)key);
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
					   struct seq_file *m)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	seq_printf(m, "%u: {\n", *(u32 *)key);
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

static int array_map_check_btf(const struct bpf_map *map,
			       const struct btf *btf,
			       const struct btf_type *key_type,
			       const struct btf_type *value_type)
{
	u32 int_data;

	/* One exception for keyless BTF: .bss/.data/.rodata map */
	if (btf_type_is_void(key_type)) {
		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
		    map->max_entries != 1)
			return -EINVAL;

		if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
			return -EINVAL;

		return 0;
	}

	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
	/* bpf array can only take a u32 key. This check makes sure
	 * that the btf matches the attr used during map_create.
	 */
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
		return -EINVAL;

	return 0;
}

static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;

	if (!(map->map_flags & BPF_F_MMAPABLE))
		return -EINVAL;

	if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
	    PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
		return -EINVAL;

	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
				   vma->vm_pgoff + pgoff);
}
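
/* Illustrative user-space counterpart (a sketch, not part of this file):
 * a map created with BPF_F_MMAPABLE can be mapped directly, e.g.
 *
 *	void *vals = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, map_fd, 0);
 *
 * where offset 0 corresponds to element 0 because the kernel adds the
 * header-sized pgoff above, and map_len may not exceed
 * PAGE_ALIGN(max_entries * round_up(value_size, 8)).
 */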

static bool array_map_meta_equal(const struct bpf_map *meta0,
				 const struct bpf_map *meta1)
{
	return meta0->max_entries == meta1->max_entries &&
	       bpf_map_meta_equal(meta0, meta1);
}

struct bpf_iter_seq_array_map_info {
	struct bpf_map *map;
	void *percpu_value_buf;
	u32 index;
};

static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	if (info->index >= map->max_entries)
		return NULL;

	if (*pos == 0)
		++*pos;
	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	++*pos;
	++info->index;
	if (info->index >= map->max_entries)
		return NULL;

	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int off = 0, cpu = 0;
	void __percpu **pptr;
	u32 size;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, v == NULL);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.map = info->map;
	if (v) {
		ctx.key = &info->index;

		if (!info->percpu_value_buf) {
			ctx.value = v;
		} else {
			pptr = v;
			size = round_up(map->value_size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(info->percpu_value_buf + off,
						per_cpu_ptr(pptr, cpu),
						size);
				off += size;
			}
			ctx.value = info->percpu_value_buf;
		}
	}

	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_array_map_seq_show(seq, v);
}

static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_array_map_seq_show(seq, NULL);
}

static int bpf_iter_init_array_map(void *priv_data,
				   struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	seq_info->map = map;
	return 0;
}

static void bpf_iter_fini_array_map(void *priv_data)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;

	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_array_map_seq_ops = {
	.start	= bpf_array_map_seq_start,
	.next	= bpf_array_map_seq_next,
	.stop	= bpf_array_map_seq_stop,
	.show	= bpf_array_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops		= &bpf_array_map_seq_ops,
	.init_seq_private	= bpf_iter_init_array_map,
	.fini_seq_private	= bpf_iter_fini_array_map,
	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),
};

static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
	.map_meta_equal = array_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
	.map_direct_value_addr = array_map_direct_value_addr,
	.map_direct_value_meta = array_map_direct_value_meta,
	.map_mmap = array_map_mmap,
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_seq_show_elem = percpu_array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &percpu_array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};
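
/* The remaining map types in this file (prog array, perf event array,
 * cgroup array, array of maps) are "fd arrays": user space updates them
 * with file descriptors, and map_fd_get_ptr() translates each fd into the
 * kernel object pointer that is actually stored in the slot.
 */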

static int fd_array_map_alloc_check(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	/* Program read-only/write-only not supported for special maps yet. */
	if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
		return -EINVAL;
	return array_map_alloc_check(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, new_ptr);
		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, new_ptr);
	}

	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);
	return 0;
}
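
/* Illustrative sketch (not compiled here): user space populates a prog
 * array for tail calls by storing a program fd at the desired index, e.g.
 *
 *	__u32 idx = 0;
 *	int prog_fd;	// fd of a loaded, compatible program
 *	bpf_map_update_elem(prog_array_fd, &idx, &prog_fd, BPF_ANY);
 *
 * The fd is translated to a struct bpf_prog pointer via map_fd_get_ptr()
 * above; only BPF_ANY is accepted for fd arrays.
 */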

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, NULL);
		map->ops->map_poke_run(map, index, old_ptr, NULL);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, NULL);
	}

	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

/* decrement refcnt of all bpf_progs that are stored in this map */
static void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
					 struct seq_file *m)
{
	void **elem, *ptr;
	u32 prog_id;

	rcu_read_lock();

	elem = array_map_lookup_elem(map, key);
	if (elem) {
		ptr = READ_ONCE(*elem);
		if (ptr) {
			seq_printf(m, "%u: ", *(u32 *)key);
			prog_id = prog_fd_array_sys_lookup_elem(ptr);
			btf_type_seq_show(map->btf, map->btf_value_type_id,
					  &prog_id, m);
			seq_puts(m, "\n");
		}
	}

	rcu_read_unlock();
}

struct prog_poke_elem {
	struct list_head list;
	struct bpf_prog_aux *aux;
};

static int prog_array_map_poke_track(struct bpf_map *map,
				     struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;
	int ret = 0;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry(elem, &aux->poke_progs, list) {
		if (elem->aux == prog_aux)
			goto out;
	}

	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (!elem) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&elem->list);
	/* We must track the program's aux info at this point in time
	 * since the program pointer itself may not be stable yet, see
	 * also comment in prog_array_map_poke_run().
	 */
	elem->aux = prog_aux;

	list_add_tail(&elem->list, &aux->poke_progs);
out:
	mutex_unlock(&aux->poke_mutex);
	return ret;
}

static void prog_array_map_poke_untrack(struct bpf_map *map,
					struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		if (elem->aux == prog_aux) {
			list_del_init(&elem->list);
			kfree(elem);
			break;
		}
	}
	mutex_unlock(&aux->poke_mutex);
}
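
/* Called with poke_mutex held whenever a prog array slot changes: walk all
 * programs tracked for this map and live-patch their JITed tail call sites
 * for @key from @old to @new via bpf_arch_text_poke(). See the detailed
 * comment inside the loop for the corner cases this has to tolerate.
 */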
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));

	list_for_each_entry(elem, &aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *poke;
		int i, ret;

		for (i = 0; i < elem->aux->size_poke_tab; i++) {
			poke = &elem->aux->poke_tab[i];

			/* Few things to be aware of:
			 *
			 * 1) We can only ever access aux in this context, but
			 *    not aux->prog since it might not be stable yet and
			 *    there could be danger of use after free otherwise.
			 * 2) Initially when we start tracking aux, the program
			 *    is not JITed yet and also does not have a kallsyms
			 *    entry. We skip these as poke->ip_stable is not
			 *    active yet. The JIT will do the final fixup before
			 *    setting it stable. The various poke->ip_stable are
			 *    successively activated, so tail call updates can
			 *    arrive from here while JIT is still finishing its
			 *    final fixup for non-activated poke entries.
			 * 3) On program teardown, the program's kallsym entry gets
			 *    removed out of an RCU callback, but we can only untrack
			 *    from sleepable context, therefore bpf_arch_text_poke()
			 *    might not see that this is in BPF text section and
			 *    bails out with -EINVAL. As these are unreachable since
			 *    the RCU grace period already passed, we simply skip them.
			 * 4) Programs reaching a refcount of zero while patching
			 *    is in progress are also okay since we're protected under
			 *    poke_mutex and untrack the programs before the JIT
			 *    buffer is freed. When we're still in the middle of
			 *    patching and suddenly the kallsyms entry of the program
			 *    gets evicted, we just skip the rest, which is fine due
			 *    to point 3).
			 * 5) Any other error happening below from bpf_arch_text_poke()
			 *    is an unexpected bug.
			 */
			if (!READ_ONCE(poke->ip_stable))
				continue;
			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (poke->tail_call.map != map ||
			    poke->tail_call.key != key)
				continue;

			ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
						 old ? (u8 *)old->bpf_func +
						 poke->adj_off : NULL,
						 new ? (u8 *)new->bpf_func +
						 poke->adj_off : NULL);
			BUG_ON(ret < 0 && ret != -EINVAL);
		}
	}
}

static void prog_array_map_clear_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_array_aux,
					   work)->map;
	bpf_fd_array_map_clear(map);
	bpf_map_put(map);
}

static void prog_array_map_clear(struct bpf_map *map)
{
	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
						 map)->aux;
	bpf_map_inc(map);
	schedule_work(&aux->work);
}

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
	struct bpf_array_aux *aux;
	struct bpf_map *map;

	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
	if (!aux)
		return ERR_PTR(-ENOMEM);

	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
	INIT_LIST_HEAD(&aux->poke_progs);
	mutex_init(&aux->poke_mutex);

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		kfree(aux);
		return map;
	}

	container_of(map, struct bpf_array, map)->aux = aux;
	aux->map = map;

	return map;
}

static void prog_array_map_free(struct bpf_map *map)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		list_del_init(&elem->list);
		kfree(elem);
	}
	kfree(aux);
	fd_array_map_free(map);
}

/* prog_array->aux->{type,jited} is a runtime binding.
 * Doing static check alone in the verifier is not enough.
 * Thus, prog_array_map cannot be used as an inner_map
 * and map_meta_equal is not implemented.
 */
static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_poke_track = prog_array_map_poke_track,
	.map_poke_untrack = prog_array_map_poke_untrack,
	.map_poke_run = prog_array_map_poke_run,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
	.map_release_uref = prog_array_map_clear,
	.map_seq_show_elem = prog_array_map_seq_show_elem,
	.map_btf_name = "bpf_array",
	.map_btf_id = &prog_array_map_btf_id,
};

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}
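
/* Freeing is RCU-deferred because BPF programs dereference these entries
 * under rcu_read_lock(); a reader that raced with fd_array_map_delete_elem()
 * may still hold the old pointer until its RCU read-side section ends, so
 * the perf file reference and the entry itself are only released after a
 * grace period.
 */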

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &perf_event_array_map_btf_id,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &cgroup_array_map_btf_id,
};
#endif

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}
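
/* Note: for an array of maps the stored values are struct bpf_map pointers
 * installed from map fds via bpf_map_fd_get_ptr(). The READ_ONCE() above
 * pairs with the xchg() in bpf_fd_array_map_update_elem() so a lookup sees
 * either the old or the new inner map, never a torn pointer.
 */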

static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	}
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_of_maps_map_btf_id,
};