// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>

#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
	 BPF_F_PRESERVE_ELEMS)

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
int array_map_alloc_check(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return -EINVAL;

	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
	    attr->map_flags & BPF_F_MMAPABLE)
		return -EINVAL;

	if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
	    attr->map_flags & BPF_F_PRESERVE_ELEMS)
		return -EINVAL;

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return -E2BIG;

	return 0;
}

static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int ret, numa_node = bpf_map_attr_numa_node(attr);
	u32 elem_size, index_mask, max_entries;
	bool bypass_spec_v1 = bpf_bypass_spec_v1();
	u64 cost, array_size, mask64;
	struct bpf_map_memory mem;
	struct bpf_array *array;

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
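	/* Worked example, added for clarity (not in the original source):
	 * with max_entries == 5, fls_long(4) == 3, so mask64 == 0b111 and
	 * index_mask == 7; with the Spectre v1 mitigation in effect below,
	 * the array is then sized for index_mask + 1 == 8 entries.
	 */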
	if (!bypass_spec_v1) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu) {
		array_size += (u64) max_entries * sizeof(void *);
	} else {
		/* rely on vmalloc() to return page-aligned memory and
		 * ensure array->value is exactly page-aligned
		 */
		if (attr->map_flags & BPF_F_MMAPABLE) {
			array_size = PAGE_ALIGN(array_size);
			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
		} else {
			array_size += (u64) max_entries * elem_size;
		}
	}

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (percpu)
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();

	ret = bpf_map_charge_init(&mem, cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	if (attr->map_flags & BPF_F_MMAPABLE) {
		void *data;

		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
		if (!data) {
			bpf_map_charge_finish(&mem);
			return ERR_PTR(-ENOMEM);
		}
		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
			- offsetof(struct bpf_array, value);
	} else {
		array = bpf_map_area_alloc(array_size, numa_node);
	}
	if (!array) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}
	array->index_mask = index_mask;
	array->map.bypass_spec_v1 = bypass_spec_v1;

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);
	bpf_map_charge_move(&array->map.memory, &mem);
	array->elem_size = elem_size;

	if (percpu && bpf_array_alloc_percpu(array)) {
		bpf_map_charge_finish(&array->map.memory);
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
				       u32 off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (off >= map->value_size)
		return -EINVAL;

	*imm = (unsigned long)array->value;
	return 0;
}

static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
				       u32 *off)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u64 base = (unsigned long)array->value;
	u64 range = array->elem_size;

	if (map->max_entries != 1)
		return -ENOTSUPP;
	if (imm < base || imm >= base + range)
		return -ENOENT;

	*off = imm - base;
	return 0;
}

/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

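	/* The emitted sequence mirrors array_map_lookup_elem() (comment added
	 * for clarity):
	 *   r1 += offsetof(struct bpf_array, value)
	 *   r0  = *(u32 *)(r2 + 0)            // index
	 *   if r0 >= max_entries goto miss
	 *   r0 &= index_mask                  // only if !bypass_spec_v1
	 *   r0  = r0 * elem_size + r1         // LSH for power-of-2 sizes
	 *   goto done
	 * miss:
	 *   r0  = 0                           // NULL
	 * done:
	 */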
	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
	}

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

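/* Note added for clarity: array iteration via BPF_MAP_GET_NEXT_KEY below is a
 * plain index walk. A NULL or out-of-range key restarts at index 0, and the
 * last valid index returns -ENOENT to terminate the walk.
 */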
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	char *val;

	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags & BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (unlikely((map_flags & BPF_F_LOCK) &&
		     !map_value_has_spin_lock(map)))
		return -EINVAL;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	} else {
		val = array->value +
			array->elem_size * (index & array->index_mask);
		if (map_flags & BPF_F_LOCK)
			copy_map_value_locked(map, val, value, false);
		else
			copy_map_value(map, val, value);
	}
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

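	/* Illustrative example, added for clarity: with value_size == 12 and
	 * four possible CPUs, user space supplies 4 * round_up(12, 8) == 64
	 * bytes, i.e. one 16-byte slot per possible CPU.
	 */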
	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks are possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

static void *array_map_vmalloc_addr(struct bpf_array *array)
{
	return (void *)round_down((unsigned long)array, PAGE_SIZE);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	if (array->map.map_flags & BPF_F_MMAPABLE)
		bpf_map_area_free(array_map_vmalloc_addr(array));
	else
		bpf_map_area_free(array);
}

static void array_map_seq_show_elem(struct bpf_map *map, void *key,
				    struct seq_file *m)
{
	void *value;

	rcu_read_lock();

	value = array_map_lookup_elem(map, key);
	if (!value) {
		rcu_read_unlock();
		return;
	}

	if (map->btf_key_type_id)
		seq_printf(m, "%u: ", *(u32 *)key);
	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
	seq_puts(m, "\n");

	rcu_read_unlock();
}

static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
					   struct seq_file *m)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	seq_printf(m, "%u: {\n", *(u32 *)key);
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

static int array_map_check_btf(const struct bpf_map *map,
			       const struct btf *btf,
			       const struct btf_type *key_type,
			       const struct btf_type *value_type)
{
	u32 int_data;

	/* One exception for keyless BTF: .bss/.data/.rodata map */
	if (btf_type_is_void(key_type)) {
		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
		    map->max_entries != 1)
			return -EINVAL;

		if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
			return -EINVAL;

		return 0;
	}

	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
		return -EINVAL;

	int_data = *(u32 *)(key_type + 1);
	/* bpf array can only take a u32 key. This check makes sure
	 * that the btf matches the attr used during map_create.
	 */
	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
		return -EINVAL;

	return 0;
}

static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;

	if (!(map->map_flags & BPF_F_MMAPABLE))
		return -EINVAL;

	if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
	    PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
		return -EINVAL;

	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
				   vma->vm_pgoff + pgoff);
}

static bool array_map_meta_equal(const struct bpf_map *meta0,
				 const struct bpf_map *meta1)
{
	return meta0->max_entries == meta1->max_entries &&
	       bpf_map_meta_equal(meta0, meta1);
}

struct bpf_iter_seq_array_map_info {
	struct bpf_map *map;
	void *percpu_value_buf;
	u32 index;
};

static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	if (info->index >= map->max_entries)
		return NULL;

	if (*pos == 0)
		++*pos;
	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_map *map = info->map;
	struct bpf_array *array;
	u32 index;

	++*pos;
	++info->index;
	if (info->index >= map->max_entries)
		return NULL;

	array = container_of(map, struct bpf_array, map);
	index = info->index & array->index_mask;
	if (info->percpu_value_buf)
		return array->pptrs[index];
	return array->value + array->elem_size * index;
}

static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_seq_array_map_info *info = seq->private;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int off = 0, cpu = 0;
	void __percpu **pptr;
	u32 size;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, v == NULL);
	if (!prog)
		return 0;

	ctx.meta = &meta;
	ctx.map = info->map;
	if (v) {
		ctx.key = &info->index;

		if (!info->percpu_value_buf) {
			ctx.value = v;
		} else {
			pptr = v;
			size = round_up(map->value_size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(info->percpu_value_buf + off,
						per_cpu_ptr(pptr, cpu),
						size);
				off += size;
			}
			ctx.value = info->percpu_value_buf;
		}
	}

	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_array_map_seq_show(seq, v);
}

static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_array_map_seq_show(seq, NULL);
}

static int bpf_iter_init_array_map(void *priv_data,
				   struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

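	/* Comment added for clarity: per-cpu arrays need a scratch buffer big
	 * enough for one round_up(value_size, 8) sized copy per possible CPU;
	 * __bpf_array_map_seq_show() copies the per-cpu values into it. Plain
	 * arrays hand out element pointers directly.
	 */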
	if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	seq_info->map = map;
	return 0;
}

static void bpf_iter_fini_array_map(void *priv_data)
{
	struct bpf_iter_seq_array_map_info *seq_info = priv_data;

	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_array_map_seq_ops = {
	.start = bpf_array_map_seq_start,
	.next = bpf_array_map_seq_next,
	.stop = bpf_array_map_seq_stop,
	.show = bpf_array_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_array_map_seq_ops,
	.init_seq_private = bpf_iter_init_array_map,
	.fini_seq_private = bpf_iter_fini_array_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};

static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
	.map_meta_equal = array_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
	.map_direct_value_addr = array_map_direct_value_addr,
	.map_direct_value_meta = array_map_direct_value_meta,
	.map_mmap = array_map_mmap,
	.map_seq_show_elem = array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_seq_show_elem = percpu_array_map_seq_show_elem,
	.map_check_btf = array_map_check_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &percpu_array_map_btf_id,
	.iter_seq_info = &iter_seq_info,
};

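/* Overview comment added for clarity: the remaining array flavours (prog
 * array, perf event array, cgroup array, array of maps) store kernel object
 * pointers in array->ptrs[]. User space updates them with file descriptors,
 * which ->map_fd_get_ptr() translates into a reference on the corresponding
 * object; ->map_fd_put_ptr() drops it again.
 */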
static int fd_array_map_alloc_check(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	/* Program read-only/write-only not supported for special maps yet. */
	if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
		return -EINVAL;
	return array_map_alloc_check(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, new_ptr);
		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, new_ptr);
	}

	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);
	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, NULL);
		map->ops->map_poke_run(map, index, old_ptr, NULL);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, NULL);
	}

	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

/* decrement refcnt of all bpf_progs that are stored in this map */
static void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
					 struct seq_file *m)
{
	void **elem, *ptr;
	u32 prog_id;

	rcu_read_lock();

	elem = array_map_lookup_elem(map, key);
	if (elem) {
		ptr = READ_ONCE(*elem);
		if (ptr) {
			seq_printf(m, "%u: ", *(u32 *)key);
			prog_id = prog_fd_array_sys_lookup_elem(ptr);
			btf_type_seq_show(map->btf, map->btf_value_type_id,
					  &prog_id, m);
			seq_puts(m, "\n");
		}
	}

	rcu_read_unlock();
}

struct prog_poke_elem {
	struct list_head list;
	struct bpf_prog_aux *aux;
};

static int prog_array_map_poke_track(struct bpf_map *map,
				     struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;
	int ret = 0;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry(elem, &aux->poke_progs, list) {
		if (elem->aux == prog_aux)
			goto out;
	}

	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (!elem) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&elem->list);
	/* We must track the program's aux info at this point in time
	 * since the program pointer itself may not be stable yet, see
	 * also comment in prog_array_map_poke_run().
	 */
	elem->aux = prog_aux;

	list_add_tail(&elem->list, &aux->poke_progs);
out:
	mutex_unlock(&aux->poke_mutex);
	return ret;
}

static void prog_array_map_poke_untrack(struct bpf_map *map,
					struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		if (elem->aux == prog_aux) {
			list_del_init(&elem->list);
			kfree(elem);
			break;
		}
	}
	mutex_unlock(&aux->poke_mutex);
}

static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	u8 *old_addr, *new_addr, *old_bypass_addr;
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));

	list_for_each_entry(elem, &aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *poke;
		int i, ret;

		for (i = 0; i < elem->aux->size_poke_tab; i++) {
			poke = &elem->aux->poke_tab[i];

			/* Few things to be aware of:
			 *
			 * 1) We can only ever access aux in this context, but
			 *    not aux->prog since it might not be stable yet and
			 *    there could be danger of use after free otherwise.
			 * 2) Initially when we start tracking aux, the program
			 *    is not JITed yet and also does not have a kallsyms
			 *    entry. We skip these as poke->tailcall_target_stable
			 *    is not active yet. The JIT will do the final fixup
			 *    before setting it stable. The various
			 *    poke->tailcall_target_stable are successively
			 *    activated, so tail call updates can arrive from here
			 *    while JIT is still finishing its final fixup for
			 *    non-activated poke entries.
			 * 3) On program teardown, the program's kallsym entry gets
			 *    removed out of RCU callback, but we can only untrack
			 *    from sleepable context, therefore bpf_arch_text_poke()
			 *    might not see that this is in BPF text section and
			 *    bails out with -EINVAL. As these are unreachable since
			 *    RCU grace period already passed, we simply skip them.
			 * 4) Also programs reaching refcount of zero while patching
			 *    is in progress is okay since we're protected under
			 *    poke_mutex and untrack the programs before the JIT
			 *    buffer is freed. When we're still in the middle of
			 *    patching and suddenly kallsyms entry of the program
			 *    gets evicted, we just skip the rest which is fine due
			 *    to point 3).
			 * 5) Any other error happening below from bpf_arch_text_poke()
			 *    is an unexpected bug.
			 */
			if (!READ_ONCE(poke->tailcall_target_stable))
				continue;
			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (poke->tail_call.map != map ||
			    poke->tail_call.key != key)
				continue;

			old_bypass_addr = old ? NULL : poke->bypass_addr;
			old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
			new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;

			if (new) {
				ret = bpf_arch_text_poke(poke->tailcall_target,
							 BPF_MOD_JUMP,
							 old_addr, new_addr);
				BUG_ON(ret < 0 && ret != -EINVAL);
				if (!old) {
					ret = bpf_arch_text_poke(poke->tailcall_bypass,
								 BPF_MOD_JUMP,
								 poke->bypass_addr,
								 NULL);
					BUG_ON(ret < 0 && ret != -EINVAL);
				}
			} else {
				ret = bpf_arch_text_poke(poke->tailcall_bypass,
							 BPF_MOD_JUMP,
							 old_bypass_addr,
							 poke->bypass_addr);
				BUG_ON(ret < 0 && ret != -EINVAL);
				/* let other CPUs finish the execution of program
				 * so that it will not be possible to expose them
				 * to invalid nop, stack unwind, nop state
				 */
				if (!ret)
					synchronize_rcu();
				ret = bpf_arch_text_poke(poke->tailcall_target,
							 BPF_MOD_JUMP,
							 old_addr, NULL);
				BUG_ON(ret < 0 && ret != -EINVAL);
			}
		}
	}
}

static void prog_array_map_clear_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_array_aux,
					   work)->map;
	bpf_fd_array_map_clear(map);
	bpf_map_put(map);
}

static void prog_array_map_clear(struct bpf_map *map)
{
	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
						 map)->aux;
	bpf_map_inc(map);
	schedule_work(&aux->work);
}

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
	struct bpf_array_aux *aux;
	struct bpf_map *map;

	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
	if (!aux)
		return ERR_PTR(-ENOMEM);

	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
	INIT_LIST_HEAD(&aux->poke_progs);
	mutex_init(&aux->poke_mutex);

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		kfree(aux);
		return map;
	}

	container_of(map, struct bpf_array, map)->aux = aux;
	aux->map = map;

	return map;
}

static void prog_array_map_free(struct bpf_map *map)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		list_del_init(&elem->list);
		kfree(elem);
	}
	kfree(aux);
	fd_array_map_free(map);
}

/* prog_array->aux->{type,jited} is a runtime binding.
 * Doing static check alone in the verifier is not enough.
 * Thus, prog_array_map cannot be used as an inner_map
 * and map_meta_equal is not implemented.
 */
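/* Note added for clarity: prog arrays back the bpf_tail_call() helper; the
 * map_poke_* callbacks above let the JIT patch tail call targets directly
 * when entries are updated.
 */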
static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_poke_track = prog_array_map_poke_track,
	.map_poke_untrack = prog_array_map_poke_untrack,
	.map_poke_run = prog_array_map_poke_run,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
	.map_release_uref = prog_array_map_clear,
	.map_seq_show_elem = prog_array_map_seq_show_elem,
	.map_btf_name = "bpf_array",
	.map_btf_id = &prog_array_map_btf_id,
};

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
		return;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static void perf_event_fd_array_map_free(struct bpf_map *map)
{
	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
		bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = perf_event_fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &perf_event_array_map_btf_id,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &cgroup_array_map_btf_id,
};
#endif

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	}
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_array",
	.map_btf_id = &array_of_maps_map_btf_id,
};