1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/bpf-cgroup.h> 3 #include <linux/bpf.h> 4 #include <linux/bpf_local_storage.h> 5 #include <linux/btf.h> 6 #include <linux/bug.h> 7 #include <linux/filter.h> 8 #include <linux/mm.h> 9 #include <linux/rbtree.h> 10 #include <linux/slab.h> 11 #include <uapi/linux/btf.h> 12 #include <linux/btf_ids.h> 13 14 #ifdef CONFIG_CGROUP_BPF 15 16 #include "../cgroup/cgroup-internal.h" 17 18 #define LOCAL_STORAGE_CREATE_FLAG_MASK \ 19 (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) 20 21 struct bpf_cgroup_storage_map { 22 struct bpf_map map; 23 24 spinlock_t lock; 25 struct rb_root root; 26 struct list_head list; 27 }; 28 29 static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) 30 { 31 return container_of(map, struct bpf_cgroup_storage_map, map); 32 } 33 34 static bool attach_type_isolated(const struct bpf_map *map) 35 { 36 return map->key_size == sizeof(struct bpf_cgroup_storage_key); 37 } 38 39 static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, 40 const void *_key1, const void *_key2) 41 { 42 if (attach_type_isolated(&map->map)) { 43 const struct bpf_cgroup_storage_key *key1 = _key1; 44 const struct bpf_cgroup_storage_key *key2 = _key2; 45 46 if (key1->cgroup_inode_id < key2->cgroup_inode_id) 47 return -1; 48 else if (key1->cgroup_inode_id > key2->cgroup_inode_id) 49 return 1; 50 else if (key1->attach_type < key2->attach_type) 51 return -1; 52 else if (key1->attach_type > key2->attach_type) 53 return 1; 54 } else { 55 const __u64 *cgroup_inode_id1 = _key1; 56 const __u64 *cgroup_inode_id2 = _key2; 57 58 if (*cgroup_inode_id1 < *cgroup_inode_id2) 59 return -1; 60 else if (*cgroup_inode_id1 > *cgroup_inode_id2) 61 return 1; 62 } 63 return 0; 64 } 65 66 struct bpf_cgroup_storage * 67 cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, 68 void *key, bool locked) 69 { 70 struct rb_root *root = &map->root; 71 struct rb_node *node; 72 73 if (!locked) 74 spin_lock_bh(&map->lock); 75 76 node = root->rb_node; 77 while (node) { 78 struct bpf_cgroup_storage *storage; 79 80 storage = container_of(node, struct bpf_cgroup_storage, node); 81 82 switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { 83 case -1: 84 node = node->rb_left; 85 break; 86 case 1: 87 node = node->rb_right; 88 break; 89 default: 90 if (!locked) 91 spin_unlock_bh(&map->lock); 92 return storage; 93 } 94 } 95 96 if (!locked) 97 spin_unlock_bh(&map->lock); 98 99 return NULL; 100 } 101 102 static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, 103 struct bpf_cgroup_storage *storage) 104 { 105 struct rb_root *root = &map->root; 106 struct rb_node **new = &(root->rb_node), *parent = NULL; 107 108 while (*new) { 109 struct bpf_cgroup_storage *this; 110 111 this = container_of(*new, struct bpf_cgroup_storage, node); 112 113 parent = *new; 114 switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { 115 case -1: 116 new = &((*new)->rb_left); 117 break; 118 case 1: 119 new = &((*new)->rb_right); 120 break; 121 default: 122 return -EEXIST; 123 } 124 } 125 126 rb_link_node(&storage->node, parent, new); 127 rb_insert_color(&storage->node, root); 128 129 return 0; 130 } 131 132 static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) 133 { 134 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 135 struct bpf_cgroup_storage *storage; 136 137 storage = cgroup_storage_lookup(map, key, false); 138 if (!storage) 139 return NULL; 140 141 return &READ_ONCE(storage->buf)->data[0]; 142 } 143 144 static int cgroup_storage_update_elem(struct bpf_map *map, void *key, 145 void *value, u64 flags) 146 { 147 struct bpf_cgroup_storage *storage; 148 struct bpf_storage_buffer *new; 149 150 if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) 151 return -EINVAL; 152 153 if (unlikely((flags & BPF_F_LOCK) && 154 !map_value_has_spin_lock(map))) 155 return -EINVAL; 156 157 storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, 158 key, false); 159 if (!storage) 160 return -ENOENT; 161 162 if (flags & BPF_F_LOCK) { 163 copy_map_value_locked(map, storage->buf->data, value, false); 164 return 0; 165 } 166 167 new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), 168 __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, 169 map->numa_node); 170 if (!new) 171 return -ENOMEM; 172 173 memcpy(&new->data[0], value, map->value_size); 174 check_and_init_map_value(map, new->data); 175 176 new = xchg(&storage->buf, new); 177 kfree_rcu(new, rcu); 178 179 return 0; 180 } 181 182 int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, 183 void *value) 184 { 185 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 186 struct bpf_cgroup_storage *storage; 187 int cpu, off = 0; 188 u32 size; 189 190 rcu_read_lock(); 191 storage = cgroup_storage_lookup(map, key, false); 192 if (!storage) { 193 rcu_read_unlock(); 194 return -ENOENT; 195 } 196 197 /* per_cpu areas are zero-filled and bpf programs can only 198 * access 'value_size' of them, so copying rounded areas 199 * will not leak any kernel data 200 */ 201 size = round_up(_map->value_size, 8); 202 for_each_possible_cpu(cpu) { 203 bpf_long_memcpy(value + off, 204 per_cpu_ptr(storage->percpu_buf, cpu), size); 205 off += size; 206 } 207 rcu_read_unlock(); 208 return 0; 209 } 210 211 int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, 212 void *value, u64 map_flags) 213 { 214 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 215 struct bpf_cgroup_storage *storage; 216 int cpu, off = 0; 217 u32 size; 218 219 if (map_flags != BPF_ANY && map_flags != BPF_EXIST) 220 return -EINVAL; 221 222 rcu_read_lock(); 223 storage = cgroup_storage_lookup(map, key, false); 224 if (!storage) { 225 rcu_read_unlock(); 226 return -ENOENT; 227 } 228 229 /* the user space will provide round_up(value_size, 8) bytes that 230 * will be copied into per-cpu area. bpf programs can only access 231 * value_size of it. During lookup the same extra bytes will be 232 * returned or zeros which were zero-filled by percpu_alloc, 233 * so no kernel data leaks possible 234 */ 235 size = round_up(_map->value_size, 8); 236 for_each_possible_cpu(cpu) { 237 bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), 238 value + off, size); 239 off += size; 240 } 241 rcu_read_unlock(); 242 return 0; 243 } 244 245 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, 246 void *_next_key) 247 { 248 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 249 struct bpf_cgroup_storage *storage; 250 251 spin_lock_bh(&map->lock); 252 253 if (list_empty(&map->list)) 254 goto enoent; 255 256 if (key) { 257 storage = cgroup_storage_lookup(map, key, true); 258 if (!storage) 259 goto enoent; 260 261 storage = list_next_entry(storage, list_map); 262 if (!storage) 263 goto enoent; 264 } else { 265 storage = list_first_entry(&map->list, 266 struct bpf_cgroup_storage, list_map); 267 } 268 269 spin_unlock_bh(&map->lock); 270 271 if (attach_type_isolated(&map->map)) { 272 struct bpf_cgroup_storage_key *next = _next_key; 273 *next = storage->key; 274 } else { 275 __u64 *next = _next_key; 276 *next = storage->key.cgroup_inode_id; 277 } 278 return 0; 279 280 enoent: 281 spin_unlock_bh(&map->lock); 282 return -ENOENT; 283 } 284 285 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) 286 { 287 __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; 288 int numa_node = bpf_map_attr_numa_node(attr); 289 struct bpf_cgroup_storage_map *map; 290 291 /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu 292 * is the same as other local storages. 293 */ 294 if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) 295 max_value_size = min_t(__u32, max_value_size, 296 PCPU_MIN_UNIT_SIZE); 297 298 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && 299 attr->key_size != sizeof(__u64)) 300 return ERR_PTR(-EINVAL); 301 302 if (attr->value_size == 0) 303 return ERR_PTR(-EINVAL); 304 305 if (attr->value_size > max_value_size) 306 return ERR_PTR(-E2BIG); 307 308 if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || 309 !bpf_map_flags_access_ok(attr->map_flags)) 310 return ERR_PTR(-EINVAL); 311 312 if (attr->max_entries) 313 /* max_entries is not used and enforced to be 0 */ 314 return ERR_PTR(-EINVAL); 315 316 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 317 __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); 318 if (!map) 319 return ERR_PTR(-ENOMEM); 320 321 /* copy mandatory map attributes */ 322 bpf_map_init_from_attr(&map->map, attr); 323 324 spin_lock_init(&map->lock); 325 map->root = RB_ROOT; 326 INIT_LIST_HEAD(&map->list); 327 328 return &map->map; 329 } 330 331 static void cgroup_storage_map_free(struct bpf_map *_map) 332 { 333 struct bpf_cgroup_storage_map *map = map_to_storage(_map); 334 struct list_head *storages = &map->list; 335 struct bpf_cgroup_storage *storage, *stmp; 336 337 mutex_lock(&cgroup_mutex); 338 339 list_for_each_entry_safe(storage, stmp, storages, list_map) { 340 bpf_cgroup_storage_unlink(storage); 341 bpf_cgroup_storage_free(storage); 342 } 343 344 mutex_unlock(&cgroup_mutex); 345 346 WARN_ON(!RB_EMPTY_ROOT(&map->root)); 347 WARN_ON(!list_empty(&map->list)); 348 349 kfree(map); 350 } 351 352 static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) 353 { 354 return -EINVAL; 355 } 356 357 static int cgroup_storage_check_btf(const struct bpf_map *map, 358 const struct btf *btf, 359 const struct btf_type *key_type, 360 const struct btf_type *value_type) 361 { 362 if (attach_type_isolated(map)) { 363 struct btf_member *m; 364 u32 offset, size; 365 366 /* Key is expected to be of struct bpf_cgroup_storage_key type, 367 * which is: 368 * struct bpf_cgroup_storage_key { 369 * __u64 cgroup_inode_id; 370 * __u32 attach_type; 371 * }; 372 */ 373 374 /* 375 * Key_type must be a structure with two fields. 376 */ 377 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || 378 BTF_INFO_VLEN(key_type->info) != 2) 379 return -EINVAL; 380 381 /* 382 * The first field must be a 64 bit integer at 0 offset. 383 */ 384 m = (struct btf_member *)(key_type + 1); 385 size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); 386 if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) 387 return -EINVAL; 388 389 /* 390 * The second field must be a 32 bit integer at 64 bit offset. 391 */ 392 m++; 393 offset = offsetof(struct bpf_cgroup_storage_key, attach_type); 394 size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); 395 if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) 396 return -EINVAL; 397 } else { 398 u32 int_data; 399 400 /* 401 * Key is expected to be u64, which stores the cgroup_inode_id 402 */ 403 404 if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) 405 return -EINVAL; 406 407 int_data = *(u32 *)(key_type + 1); 408 if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) 409 return -EINVAL; 410 } 411 412 return 0; 413 } 414 415 static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, 416 struct seq_file *m) 417 { 418 enum bpf_cgroup_storage_type stype; 419 struct bpf_cgroup_storage *storage; 420 int cpu; 421 422 rcu_read_lock(); 423 storage = cgroup_storage_lookup(map_to_storage(map), key, false); 424 if (!storage) { 425 rcu_read_unlock(); 426 return; 427 } 428 429 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); 430 stype = cgroup_storage_type(map); 431 if (stype == BPF_CGROUP_STORAGE_SHARED) { 432 seq_puts(m, ": "); 433 btf_type_seq_show(map->btf, map->btf_value_type_id, 434 &READ_ONCE(storage->buf)->data[0], m); 435 seq_puts(m, "\n"); 436 } else { 437 seq_puts(m, ": {\n"); 438 for_each_possible_cpu(cpu) { 439 seq_printf(m, "\tcpu%d: ", cpu); 440 btf_type_seq_show(map->btf, map->btf_value_type_id, 441 per_cpu_ptr(storage->percpu_buf, cpu), 442 m); 443 seq_puts(m, "\n"); 444 } 445 seq_puts(m, "}\n"); 446 } 447 rcu_read_unlock(); 448 } 449 450 BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, 451 bpf_cgroup_storage_map) 452 const struct bpf_map_ops cgroup_storage_map_ops = { 453 .map_alloc = cgroup_storage_map_alloc, 454 .map_free = cgroup_storage_map_free, 455 .map_get_next_key = cgroup_storage_get_next_key, 456 .map_lookup_elem = cgroup_storage_lookup_elem, 457 .map_update_elem = cgroup_storage_update_elem, 458 .map_delete_elem = cgroup_storage_delete_elem, 459 .map_check_btf = cgroup_storage_check_btf, 460 .map_seq_show_elem = cgroup_storage_seq_show_elem, 461 .map_btf_id = &cgroup_storage_map_btf_ids[0], 462 }; 463 464 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) 465 { 466 enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); 467 468 if (aux->cgroup_storage[stype] && 469 aux->cgroup_storage[stype] != _map) 470 return -EBUSY; 471 472 aux->cgroup_storage[stype] = _map; 473 return 0; 474 } 475 476 static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) 477 { 478 size_t size; 479 480 if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { 481 size = sizeof(struct bpf_storage_buffer) + map->value_size; 482 *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, 483 PAGE_SIZE) >> PAGE_SHIFT; 484 } else { 485 size = map->value_size; 486 *pages = round_up(round_up(size, 8) * num_possible_cpus(), 487 PAGE_SIZE) >> PAGE_SHIFT; 488 } 489 490 return size; 491 } 492 493 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, 494 enum bpf_cgroup_storage_type stype) 495 { 496 const gfp_t gfp = __GFP_ZERO | GFP_USER; 497 struct bpf_cgroup_storage *storage; 498 struct bpf_map *map; 499 size_t size; 500 u32 pages; 501 502 map = prog->aux->cgroup_storage[stype]; 503 if (!map) 504 return NULL; 505 506 size = bpf_cgroup_storage_calculate_size(map, &pages); 507 508 storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), 509 gfp, map->numa_node); 510 if (!storage) 511 goto enomem; 512 513 if (stype == BPF_CGROUP_STORAGE_SHARED) { 514 storage->buf = bpf_map_kmalloc_node(map, size, gfp, 515 map->numa_node); 516 if (!storage->buf) 517 goto enomem; 518 check_and_init_map_value(map, storage->buf->data); 519 } else { 520 storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); 521 if (!storage->percpu_buf) 522 goto enomem; 523 } 524 525 storage->map = (struct bpf_cgroup_storage_map *)map; 526 527 return storage; 528 529 enomem: 530 kfree(storage); 531 return ERR_PTR(-ENOMEM); 532 } 533 534 static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) 535 { 536 struct bpf_cgroup_storage *storage = 537 container_of(rcu, struct bpf_cgroup_storage, rcu); 538 539 kfree(storage->buf); 540 kfree(storage); 541 } 542 543 static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) 544 { 545 struct bpf_cgroup_storage *storage = 546 container_of(rcu, struct bpf_cgroup_storage, rcu); 547 548 free_percpu(storage->percpu_buf); 549 kfree(storage); 550 } 551 552 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) 553 { 554 enum bpf_cgroup_storage_type stype; 555 struct bpf_map *map; 556 557 if (!storage) 558 return; 559 560 map = &storage->map->map; 561 stype = cgroup_storage_type(map); 562 if (stype == BPF_CGROUP_STORAGE_SHARED) 563 call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); 564 else 565 call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); 566 } 567 568 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, 569 struct cgroup *cgroup, 570 enum bpf_attach_type type) 571 { 572 struct bpf_cgroup_storage_map *map; 573 574 if (!storage) 575 return; 576 577 storage->key.attach_type = type; 578 storage->key.cgroup_inode_id = cgroup_id(cgroup); 579 580 map = storage->map; 581 582 spin_lock_bh(&map->lock); 583 WARN_ON(cgroup_storage_insert(map, storage)); 584 list_add(&storage->list_map, &map->list); 585 list_add(&storage->list_cg, &cgroup->bpf.storages); 586 spin_unlock_bh(&map->lock); 587 } 588 589 void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) 590 { 591 struct bpf_cgroup_storage_map *map; 592 struct rb_root *root; 593 594 if (!storage) 595 return; 596 597 map = storage->map; 598 599 spin_lock_bh(&map->lock); 600 root = &map->root; 601 rb_erase(&storage->node, root); 602 603 list_del(&storage->list_map); 604 list_del(&storage->list_cg); 605 spin_unlock_bh(&map->lock); 606 } 607 608 #endif 609