/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		free_percpu(array->pptrs[i]);
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
	}

	return 0;
}

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	struct bpf_array *array;
	u64 array_size;
	u32 elem_size;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 || attr->map_flags)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) attr->max_entries * sizeof(void *);
	else
		array_size += (u64) attr->max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size);
	if (!array)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * index;
}
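
/* Usage sketch (illustrative only, not part of this file): user space
 * creates an ARRAY map through the bpf(2) syscall and then reads/writes
 * elements by index, e.g.
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_ARRAY,
 *		.key_size    = sizeof(__u32),	// must be 4, see array_map_alloc()
 *		.value_size  = sizeof(long),
 *		.max_entries = 256,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * Lookups from both the syscall path and eBPF programs index straight into
 * the flat ->value area allocated in array_map_alloc() above; there is no
 * hashing, and elements are never allocated or freed individually.
 */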

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index]),
		       value, map->value_size);
	else
		memcpy(array->value + array->elem_size * index,
		       value, map->value_size);
	return 0;
}
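
/* Layout note (sketch, not enforced here): when user space looks up or
 * updates a BPF_MAP_TYPE_PERCPU_ARRAY element through the syscall, the
 * value buffer carries one slot per possible CPU, each padded to 8 bytes,
 * which is exactly the stride that bpf_percpu_array_copy() above and
 * bpf_percpu_array_update() below walk:
 *
 *	value_size  = sizeof(struct my_datum);	// 'my_datum' is hypothetical
 *	slot        = round_up(value_size, 8);
 *	buffer size = slot * num_possible_cpus();
 */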

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into the per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks are possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

static const struct bpf_map_ops array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list array_type __ro_after_init = {
	.ops = &array_ops,
	.type = BPF_MAP_TYPE_ARRAY,
};

static const struct bpf_map_ops percpu_array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list percpu_array_type __ro_after_init = {
	.ops = &percpu_array_ops,
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};

static int __init register_array_map(void)
{
	bpf_register_map_type(&array_type);
	bpf_register_map_type(&percpu_array_type);
	return 0;
}
late_initcall(register_array_map);

static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}
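
/* The fd-array variants below (prog, perf_event and cgroup arrays) store
 * kernel object pointers instead of plain data: user space writes a file
 * descriptor, map_fd_get_ptr() converts it into a referenced object, and
 * direct lookups from programs are disallowed (fd_array_map_lookup_elem()
 * above returns NULL). For a PROG_ARRAY the consumer is the tail-call
 * helper, roughly (program-side sketch; 'jmp_table' and 'idx' are
 * illustrative names):
 *
 *	bpf_tail_call(ctx, &jmp_table, idx);
 *	... execution only continues past this point if the slot was
 *	    empty or idx was out of range ...
 */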

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
};

static struct bpf_map_type_list prog_array_type __ro_after_init = {
	.ops = &prog_array_ops,
	.type = BPF_MAP_TYPE_PROG_ARRAY,
};

static int __init register_prog_array_map(void)
{
	bpf_register_map_type(&prog_array_type);
	return 0;
}
late_initcall(register_prog_array_map);

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}
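
/* Program-side usage sketch for a PERF_EVENT_ARRAY (illustrative names,
 * not part of this file): an eBPF program pushes samples to the perf event
 * stored in the current CPU's slot via the bpf_perf_event_output() helper,
 * e.g.
 *
 *	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
 *			      &data, sizeof(data));
 *
 * perf_event_fd_array_get_ptr() below decides which perf events may be
 * stored in such a map in the first place.
 */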

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	const struct perf_event_attr *attr;
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	event = perf_file->private_data;
	ee = ERR_PTR(-EINVAL);

	attr = perf_event_attrs(event);
	if (IS_ERR(attr) || attr->inherit)
		goto err_out;

	switch (attr->type) {
	case PERF_TYPE_SOFTWARE:
		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
			goto err_out;
		/* fall-through */
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
		ee = bpf_event_entry_gen(perf_file, map_file);
		if (ee)
			return ee;
		ee = ERR_PTR(-ENOMEM);
		/* fall-through */
	default:
		break;
	}

err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static const struct bpf_map_ops perf_event_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
	.ops = &perf_event_array_ops,
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

static int __init register_perf_event_array_map(void)
{
	bpf_register_map_type(&perf_event_array_type);
	return 0;
}
late_initcall(register_perf_event_array_map);

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static const struct bpf_map_ops cgroup_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};

static struct bpf_map_type_list cgroup_array_type __ro_after_init = {
	.ops = &cgroup_array_ops,
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};

static int __init register_cgroup_array_map(void)
{
	bpf_register_map_type(&cgroup_array_type);
	return 0;
}
late_initcall(register_cgroup_array_map);
#endif
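
/* Note (informational): BPF_MAP_TYPE_CGROUP_ARRAY maps registered above are
 * consumed by helpers such as bpf_skb_under_cgroup() and
 * bpf_current_task_under_cgroup(), which test whether a packet's socket or
 * the current task belongs to the cgroup stored at a given index.
 */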