1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/bpf-cgroup.h> 3 #include <linux/bpf.h> 4 #include <linux/bpf_local_storage.h> 5 #include <linux/btf.h> 6 #include <linux/bug.h> 7 #include <linux/filter.h> 8 #include <linux/mm.h> 9 #include <linux/rbtree.h> 10 #include <linux/slab.h> 11 #include <uapi/linux/btf.h> 12 #include <linux/btf_ids.h> 13 14 #ifdef CONFIG_CGROUP_BPF 15 16 #include "../cgroup/cgroup-internal.h" 17 18 #define LOCAL_STORAGE_CREATE_FLAG_MASK \ 19 (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) 20 21 struct bpf_cgroup_storage_map { 22 struct bpf_map map; 23 24 spinlock_t lock; 25 struct rb_root root; 26 struct list_head list; 27 }; 28 29 static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) 30 { 31 return container_of(map, struct bpf_cgroup_storage_map, map); 32 } 33 34 static bool attach_type_isolated(const struct bpf_map *map) 35 { 36 return map->key_size == sizeof(struct bpf_cgroup_storage_key); 37 } 38 39 static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, 40 const void *_key1, const void *_key2) 41 { 42 if (attach_type_isolated(&map->map)) { 43 const struct bpf_cgroup_storage_key *key1 = _key1; 44 const struct bpf_cgroup_storage_key *key2 = _key2; 45 46 if (key1->cgroup_inode_id < key2->cgroup_inode_id) 47 return -1; 48 else if (key1->cgroup_inode_id > key2->cgroup_inode_id) 49 return 1; 50 else if (key1->attach_type < key2->attach_type) 51 return -1; 52 else if (key1->attach_type > key2->attach_type) 53 return 1; 54 } else { 55 const __u64 *cgroup_inode_id1 = _key1; 56 const __u64 *cgroup_inode_id2 = _key2; 57 58 if (*cgroup_inode_id1 < *cgroup_inode_id2) 59 return -1; 60 else if (*cgroup_inode_id1 > *cgroup_inode_id2) 61 return 1; 62 } 63 return 0; 64 } 65 66 struct bpf_cgroup_storage * 67 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, 68 void *key, bool locked) 69 { 70 struct rb_root *root = &map->root; 71 struct rb_node *node; 72 73 if (!locked) 74 spin_lock_bh(&map->lock); 75 76 node = root->rb_node; 77 while (node) { 78 struct bpf_cgroup_storage *storage; 79 80 storage = container_of(node, struct bpf_cgroup_storage, node); 81 82 switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { 83 case -1: 84 node = node->rb_left; 85 break; 86 case 1: 87 node = node->rb_right; 88 break; 89 default: 90 if (!locked) 91 spin_unlock_bh(&map->lock); 92 return storage; 93 } 94 } 95 96 if (!locked) 97 spin_unlock_bh(&map->lock); 98 99 return NULL; 100 } 101 102 static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, 103 struct bpf_cgroup_storage *storage) 104 { 105 struct rb_root *root = &map->root; 106 struct rb_node **new = &(root->rb_node), *parent = NULL; 107 108 while (*new) { 109 struct bpf_cgroup_storage *this; 110 111 this = container_of(*new, struct bpf_cgroup_storage, node); 112 113 parent = *new; 114 switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { 115 case -1: 116 new = &((*new)->rb_left); 117 break; 118 case 1: 119 new = &((*new)->rb_right); 120 break; 121 default: 122 return -EEXIST; 123 } 124 } 125 126 rb_link_node(&storage->node, parent, new); 127 rb_insert_color(&storage->node, root); 128 129 return 0; 130 } 131 132 static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) 133 { 134 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 135 struct bpf_cgroup_storage *storage; 136 137 storage = cgroup_storage_lookup(map, key, false); 138 if (!storage) 139 return NULL; 140 141 return &READ_ONCE(storage->buf)->data[0]; 142 } 143 144 static long cgroup_storage_update_elem(struct bpf_map *map, void *key, 145 void *value, u64 flags) 146 { 147 struct bpf_cgroup_storage *storage; 148 struct bpf_storage_buffer *new; 149 150 if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) 151 return -EINVAL; 152 153 if (unlikely((flags & BPF_F_LOCK) && 154 !btf_record_has_field(map->record, BPF_SPIN_LOCK))) 155 return -EINVAL; 156 157 storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, 158 key, false); 159 if (!storage) 160 return -ENOENT; 161 162 if (flags & BPF_F_LOCK) { 163 copy_map_value_locked(map, storage->buf->data, value, false); 164 return 0; 165 } 166 167 new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), 168 __GFP_ZERO | GFP_NOWAIT, 169 map->numa_node); 170 if (!new) 171 return -ENOMEM; 172 173 memcpy(&new->data[0], value, map->value_size); 174 check_and_init_map_value(map, new->data); 175 176 new = xchg(&storage->buf, new); 177 kfree_rcu(new, rcu); 178 179 return 0; 180 } 181 182 int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, 183 void *value, u64 map_flags) 184 { 185 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 186 struct bpf_cgroup_storage *storage; 187 int cpu, off = 0; 188 u32 size; 189 190 rcu_read_lock(); 191 storage = cgroup_storage_lookup(map, key, false); 192 if (!storage) { 193 rcu_read_unlock(); 194 return -ENOENT; 195 } 196 197 /* per_cpu areas are zero-filled and bpf programs can only 198 * access 'value_size' of them, so copying rounded areas 199 * will not leak any kernel data 200 */ 201 if (map_flags & BPF_F_CPU) { 202 cpu = map_flags >> 32; 203 copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu)); 204 goto unlock; 205 } 206 size = round_up(_map->value_size, 8); 207 for_each_possible_cpu(cpu) { 208 copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); 209 off += size; 210 } 211 unlock: 212 rcu_read_unlock(); 213 return 0; 214 } 215 216 int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, 217 void *value, u64 map_flags) 218 { 219 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 220 struct bpf_cgroup_storage *storage; 221 void *val; 222 u32 size; 223 int cpu; 224 225 if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS)) 226 return -EINVAL; 227 228 rcu_read_lock(); 229 storage = cgroup_storage_lookup(map, key, false); 230 if (!storage) { 231 rcu_read_unlock(); 232 return -ENOENT; 233 } 234 235 /* the user space will provide round_up(value_size, 8) bytes that 236 * will be copied into per-cpu area. bpf programs can only access 237 * value_size of it. During lookup the same extra bytes will be 238 * returned or zeros which were zero-filled by percpu_alloc, 239 * so no kernel data leaks possible 240 */ 241 if (map_flags & BPF_F_CPU) { 242 cpu = map_flags >> 32; 243 copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value); 244 goto unlock; 245 } 246 size = round_up(_map->value_size, 8); 247 for_each_possible_cpu(cpu) { 248 val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; 249 copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val); 250 } 251 unlock: 252 rcu_read_unlock(); 253 return 0; 254 } 255 256 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, 257 void *_next_key) 258 { 259 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 260 struct bpf_cgroup_storage *storage; 261 262 spin_lock_bh(&map->lock); 263 264 if (list_empty(&map->list)) 265 goto enoent; 266 267 if (key) { 268 storage = cgroup_storage_lookup(map, key, true); 269 if (!storage) 270 goto enoent; 271 272 storage = list_next_entry(storage, list_map); 273 if (!storage) 274 goto enoent; 275 } else { 276 storage = list_first_entry(&map->list, 277 struct bpf_cgroup_storage, list_map); 278 } 279 280 spin_unlock_bh(&map->lock); 281 282 if (attach_type_isolated(&map->map)) { 283 struct bpf_cgroup_storage_key *next = _next_key; 284 *next = storage->key; 285 } else { 286 __u64 *next = _next_key; 287 *next = storage->key.cgroup_inode_id; 288 } 289 return 0; 290 291 enoent: 292 spin_unlock_bh(&map->lock); 293 return -ENOENT; 294 } 295 296 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) 297 { 298 __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; 299 int numa_node = bpf_map_attr_numa_node(attr); 300 struct bpf_cgroup_storage_map *map; 301 302 /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu 303 * is the same as other local storages. 304 */ 305 if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 306 max_value_size = min_t(__u32, max_value_size, 307 PCPU_MIN_UNIT_SIZE); 308 309 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && 310 attr->key_size != sizeof(__u64)) 311 return ERR_PTR(-EINVAL); 312 313 if (attr->value_size == 0) 314 return ERR_PTR(-EINVAL); 315 316 if (attr->value_size > max_value_size) 317 return ERR_PTR(-E2BIG); 318 319 if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || 320 !bpf_map_flags_access_ok(attr->map_flags)) 321 return ERR_PTR(-EINVAL); 322 323 if (attr->max_entries) 324 /* max_entries is not used and enforced to be 0 */ 325 return ERR_PTR(-EINVAL); 326 327 map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); 328 if (!map) 329 return ERR_PTR(-ENOMEM); 330 331 /* copy mandatory map attributes */ 332 bpf_map_init_from_attr(&map->map, attr); 333 334 spin_lock_init(&map->lock); 335 map->root = RB_ROOT; 336 INIT_LIST_HEAD(&map->list); 337 338 return &map->map; 339 } 340 341 static void cgroup_storage_map_free(struct bpf_map *_map) 342 { 343 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 344 struct list_head *storages = &map->list; 345 struct bpf_cgroup_storage *storage, *stmp; 346 347 cgroup_lock(); 348 349 list_for_each_entry_safe(storage, stmp, storages, list_map) { 350 bpf_cgroup_storage_unlink(storage); 351 bpf_cgroup_storage_free(storage); 352 } 353 354 cgroup_unlock(); 355 356 WARN_ON(!RB_EMPTY_ROOT(&map->root)); 357 WARN_ON(!list_empty(&map->list)); 358 359 bpf_map_area_free(map); 360 } 361 362 static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) 363 { 364 return -EINVAL; 365 } 366 367 static int cgroup_storage_check_btf(const struct bpf_map *map, 368 const struct btf *btf, 369 const struct btf_type *key_type, 370 const struct btf_type *value_type) 371 { 372 if (attach_type_isolated(map)) { 373 struct btf_member *m; 374 u32 offset, size; 375 376 /* Key is expected to be of struct bpf_cgroup_storage_key type, 377 * which is: 378 * struct bpf_cgroup_storage_key { 379 * __u64 cgroup_inode_id; 380 * __u32 attach_type; 381 * }; 382 */ 383 384 /* 385 * Key_type must be a structure with two fields. 386 */ 387 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || 388 BTF_INFO_VLEN(key_type->info) != 2) 389 return -EINVAL; 390 391 /* 392 * The first field must be a 64 bit integer at 0 offset. 393 */ 394 m = (struct btf_member *)(key_type + 1); 395 size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); 396 if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) 397 return -EINVAL; 398 399 /* 400 * The second field must be a 32 bit integer at 64 bit offset. 401 */ 402 m++; 403 offset = offsetof(struct bpf_cgroup_storage_key, attach_type); 404 size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); 405 if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) 406 return -EINVAL; 407 } else { 408 /* 409 * Key is expected to be u64, which stores the cgroup_inode_id 410 */ 411 if (!btf_type_is_i64(key_type)) 412 return -EINVAL; 413 } 414 415 return 0; 416 } 417 418 static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, 419 struct seq_file *m) 420 { 421 enum bpf_cgroup_storage_type stype; 422 struct bpf_cgroup_storage *storage; 423 int cpu; 424 425 rcu_read_lock(); 426 storage = cgroup_storage_lookup(map_to_storage(map), key, false); 427 if (!storage) { 428 rcu_read_unlock(); 429 return; 430 } 431 432 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 433 stype = cgroup_storage_type(map); 434 if (stype == BPF_CGROUP_STORAGE_SHARED) { 435 seq_puts(m, ": "); 436 btf_type_seq_show(map->btf, map->btf_value_type_id, 437 &READ_ONCE(storage->buf)->data[0], m); 438 seq_putc(m, '\n'); 439 } else { 440 seq_puts(m, ": {\n"); 441 for_each_possible_cpu(cpu) { 442 seq_printf(m, "\tcpu%d: ", cpu); 443 btf_type_seq_show(map->btf, map->btf_value_type_id, 444 per_cpu_ptr(storage->percpu_buf, cpu), 445 m); 446 seq_putc(m, '\n'); 447 } 448 seq_puts(m, "}\n"); 449 } 450 rcu_read_unlock(); 451 } 452 453 static u64 cgroup_storage_map_usage(const struct bpf_map *map) 454 { 455 /* Currently the dynamically allocated elements are not counted. */ 456 return sizeof(struct bpf_cgroup_storage_map); 457 } 458 459 BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, 460 bpf_cgroup_storage_map) 461 const struct bpf_map_ops cgroup_storage_map_ops = { 462 .map_alloc = cgroup_storage_map_alloc, 463 .map_free = cgroup_storage_map_free, 464 .map_get_next_key = cgroup_storage_get_next_key, 465 .map_lookup_elem = cgroup_storage_lookup_elem, 466 .map_update_elem = cgroup_storage_update_elem, 467 .map_delete_elem = cgroup_storage_delete_elem, 468 .map_check_btf = cgroup_storage_check_btf, 469 .map_seq_show_elem = cgroup_storage_seq_show_elem, 470 .map_mem_usage = cgroup_storage_map_usage, 471 .map_btf_id = &cgroup_storage_map_btf_ids[0], 472 }; 473 474 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) 475 { 476 enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); 477 478 if (aux->cgroup_storage[stype] && 479 aux->cgroup_storage[stype] != _map) 480 return -EBUSY; 481 482 aux->cgroup_storage[stype] = _map; 483 return 0; 484 } 485 486 static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) 487 { 488 size_t size; 489 490 if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { 491 size = sizeof(struct bpf_storage_buffer) + map->value_size; 492 *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, 493 PAGE_SIZE) >> PAGE_SHIFT; 494 } else { 495 size = map->value_size; 496 *pages = round_up(round_up(size, 8) * num_possible_cpus(), 497 PAGE_SIZE) >> PAGE_SHIFT; 498 } 499 500 return size; 501 } 502 503 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 504 enum bpf_cgroup_storage_type stype) 505 { 506 const gfp_t gfp = __GFP_ZERO | GFP_USER; 507 struct bpf_cgroup_storage *storage; 508 struct bpf_map *map; 509 size_t size; 510 u32 pages; 511 512 map = prog->aux->cgroup_storage[stype]; 513 if (!map) 514 return NULL; 515 516 size = bpf_cgroup_storage_calculate_size(map, &pages); 517 518 storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), 519 gfp, map->numa_node); 520 if (!storage) 521 goto enomem; 522 523 if (stype == BPF_CGROUP_STORAGE_SHARED) { 524 storage->buf = bpf_map_kmalloc_node(map, size, gfp, 525 map->numa_node); 526 if (!storage->buf) 527 goto enomem; 528 check_and_init_map_value(map, storage->buf->data); 529 } else { 530 storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); 531 if (!storage->percpu_buf) 532 goto enomem; 533 } 534 535 storage->map = (struct bpf_cgroup_storage_map *)map; 536 537 return storage; 538 539 enomem: 540 kfree(storage); 541 return ERR_PTR(-ENOMEM); 542 } 543 544 static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) 545 { 546 struct bpf_cgroup_storage *storage = 547 container_of(rcu, struct bpf_cgroup_storage, rcu); 548 549 kfree(storage->buf); 550 kfree(storage); 551 } 552 553 static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) 554 { 555 struct bpf_cgroup_storage *storage = 556 container_of(rcu, struct bpf_cgroup_storage, rcu); 557 558 free_percpu(storage->percpu_buf); 559 kfree(storage); 560 } 561 562 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) 563 { 564 enum bpf_cgroup_storage_type stype; 565 struct bpf_map *map; 566 567 if (!storage) 568 return; 569 570 map = &storage->map->map; 571 stype = cgroup_storage_type(map); 572 if (stype == BPF_CGROUP_STORAGE_SHARED) 573 call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); 574 else 575 call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); 576 } 577 578 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, 579 struct cgroup *cgroup, 580 enum bpf_attach_type type) 581 { 582 struct bpf_cgroup_storage_map *map; 583 584 if (!storage) 585 return; 586 587 storage->key.attach_type = type; 588 storage->key.cgroup_inode_id = cgroup_id(cgroup); 589 590 map = storage->map; 591 592 spin_lock_bh(&map->lock); 593 WARN_ON(cgroup_storage_insert(map, storage)); 594 list_add(&storage->list_map, &map->list); 595 list_add(&storage->list_cg, &cgroup->bpf.storages); 596 spin_unlock_bh(&map->lock); 597 } 598 599 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) 600 { 601 struct bpf_cgroup_storage_map *map; 602 struct rb_root *root; 603 604 if (!storage) 605 return; 606 607 map = storage->map; 608 609 spin_lock_bh(&map->lock); 610 root = &map->root; 611 rb_erase(&storage->node, root); 612 613 list_del(&storage->list_map); 614 list_del(&storage->list_cg); 615 spin_unlock_bh(&map->lock); 616 } 617 618 #endif 619