1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 4 #include <linux/bpf.h> 5 #include <linux/filter.h> 6 #include <linux/bpf_mem_alloc.h> 7 #include <linux/percpu.h> 8 #include <linux/refcount.h> 9 #include <linux/gfp.h> 10 #include <linux/memory.h> 11 #include <linux/local_lock.h> 12 #include <linux/mutex.h> 13 14 /* 15 * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe 16 * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and 17 * stash it in a local per-CPU variable, and bump allocate from the page 18 * whenever items need to be printed to a stream. Each page holds a global 19 * atomic refcount in its first 4 bytes, and then records of variable length 20 * that describe the printed messages. Once the global refcount has dropped to 21 * zero, it is a signal to free the page back to the kernel's page allocator, 22 * given all the individual records in it have been consumed. 23 * 24 * It is possible the same page is used to serve allocations across different 25 * programs, which may be consumed at different times individually, hence 26 * maintaining a reference count per-page is critical for correct lifetime 27 * tracking. 28 * 29 * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it 30 * lands. 31 */ 32 struct bpf_stream_page { 33 refcount_t ref; 34 u32 consumed; 35 char buf[]; 36 }; 37 38 /* Available room to add data to a refcounted page. */ 39 #define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) 40 41 static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); 42 static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); 43 44 static bool bpf_stream_page_local_lock(unsigned long *flags) 45 { 46 return local_trylock_irqsave(&stream_local_lock, *flags); 47 } 48 49 static void bpf_stream_page_local_unlock(unsigned long *flags) 50 { 51 local_unlock_irqrestore(&stream_local_lock, *flags); 52 } 53 54 static void bpf_stream_page_free(struct bpf_stream_page *stream_page) 55 { 56 struct page *p; 57 58 if (!stream_page) 59 return; 60 p = virt_to_page(stream_page); 61 free_pages_nolock(p, 0); 62 } 63 64 static void bpf_stream_page_get(struct bpf_stream_page *stream_page) 65 { 66 refcount_inc(&stream_page->ref); 67 } 68 69 static void bpf_stream_page_put(struct bpf_stream_page *stream_page) 70 { 71 if (refcount_dec_and_test(&stream_page->ref)) 72 bpf_stream_page_free(stream_page); 73 } 74 75 static void bpf_stream_page_init(struct bpf_stream_page *stream_page) 76 { 77 refcount_set(&stream_page->ref, 1); 78 stream_page->consumed = 0; 79 } 80 81 static struct bpf_stream_page *bpf_stream_page_replace(void) 82 { 83 struct bpf_stream_page *stream_page, *old_stream_page; 84 struct page *page; 85 86 page = alloc_pages_nolock(NUMA_NO_NODE, 0); 87 if (!page) 88 return NULL; 89 stream_page = page_address(page); 90 bpf_stream_page_init(stream_page); 91 92 old_stream_page = this_cpu_read(stream_pcpu_page); 93 if (old_stream_page) 94 bpf_stream_page_put(old_stream_page); 95 this_cpu_write(stream_pcpu_page, stream_page); 96 return stream_page; 97 } 98 99 static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) 100 { 101 int min = offsetof(struct bpf_stream_elem, str[0]); 102 int consumed = stream_page->consumed; 103 int total = BPF_STREAM_PAGE_SZ; 104 int rem = max(0, total - consumed - min); 105 106 /* Let's give room of at least 8 bytes. */ 107 WARN_ON_ONCE(rem % 8 != 0); 108 rem = rem < 8 ? 0 : rem; 109 return min(len, rem); 110 } 111 112 static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) 113 { 114 init_llist_node(&elem->node); 115 elem->total_len = len; 116 elem->consumed_len = 0; 117 } 118 119 static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) 120 { 121 unsigned long addr = (unsigned long)elem; 122 123 return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); 124 } 125 126 static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) 127 { 128 u32 consumed = stream_page->consumed; 129 130 stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); 131 return (struct bpf_stream_elem *)&stream_page->buf[consumed]; 132 } 133 134 static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) 135 { 136 struct bpf_stream_elem *elem = NULL; 137 struct bpf_stream_page *page; 138 int room = 0; 139 140 page = this_cpu_read(stream_pcpu_page); 141 if (!page) 142 page = bpf_stream_page_replace(); 143 if (!page) 144 return NULL; 145 146 room = bpf_stream_page_check_room(page, len); 147 if (room != len) 148 page = bpf_stream_page_replace(); 149 if (!page) 150 return NULL; 151 bpf_stream_page_get(page); 152 room = bpf_stream_page_check_room(page, len); 153 WARN_ON_ONCE(room != len); 154 155 elem = bpf_stream_page_push_elem(page, room); 156 bpf_stream_elem_init(elem, room); 157 return elem; 158 } 159 160 static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) 161 { 162 const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); 163 struct bpf_stream_elem *elem; 164 unsigned long flags; 165 166 BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); 167 /* 168 * Length denotes the amount of data to be written as part of stream element, 169 * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can 170 * accomodate, therefore deny allocations that won't fit into them. 171 */ 172 if (len < 0 || len > max_len) 173 return NULL; 174 175 if (!bpf_stream_page_local_lock(&flags)) 176 return NULL; 177 elem = bpf_stream_page_reserve_elem(len); 178 bpf_stream_page_local_unlock(&flags); 179 return elem; 180 } 181 182 static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) 183 { 184 struct bpf_stream_elem *elem = NULL; 185 186 /* 187 * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream 188 * log, elements will be popped at once and reversed to print the log. 189 */ 190 elem = bpf_stream_elem_alloc(len); 191 if (!elem) 192 return -ENOMEM; 193 194 memcpy(elem->str, str, len); 195 llist_add(&elem->node, log); 196 197 return 0; 198 } 199 200 static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) 201 { 202 if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) 203 return -ENOSPC; 204 if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { 205 atomic_sub(len, &stream->capacity); 206 return -ENOSPC; 207 } 208 return 0; 209 } 210 211 static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) 212 { 213 int len = elem->total_len; 214 215 atomic_sub(len, &stream->capacity); 216 } 217 218 static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) 219 { 220 int ret = bpf_stream_consume_capacity(stream, len); 221 222 return ret ?: __bpf_stream_push_str(&stream->log, str, len); 223 } 224 225 static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) 226 { 227 if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) 228 return NULL; 229 return &aux->stream[stream_id - 1]; 230 } 231 232 static void bpf_stream_free_elem(struct bpf_stream_elem *elem) 233 { 234 struct bpf_stream_page *p; 235 236 p = bpf_stream_page_from_elem(elem); 237 bpf_stream_page_put(p); 238 } 239 240 static void bpf_stream_free_list(struct llist_node *list) 241 { 242 struct bpf_stream_elem *elem, *tmp; 243 244 llist_for_each_entry_safe(elem, tmp, list, node) 245 bpf_stream_free_elem(elem); 246 } 247 248 static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) 249 { 250 return stream->backlog_head; 251 } 252 253 static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) 254 { 255 struct llist_node *node; 256 257 node = stream->backlog_head; 258 if (stream->backlog_head == stream->backlog_tail) 259 stream->backlog_head = stream->backlog_tail = NULL; 260 else 261 stream->backlog_head = node->next; 262 return node; 263 } 264 265 static void bpf_stream_backlog_fill(struct bpf_stream *stream) 266 { 267 struct llist_node *head, *tail; 268 269 if (llist_empty(&stream->log)) 270 return; 271 tail = llist_del_all(&stream->log); 272 if (!tail) 273 return; 274 head = llist_reverse_order(tail); 275 276 if (!stream->backlog_head) { 277 stream->backlog_head = head; 278 stream->backlog_tail = tail; 279 } else { 280 stream->backlog_tail->next = head; 281 stream->backlog_tail = tail; 282 } 283 284 return; 285 } 286 287 static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) 288 { 289 int rem = elem->total_len - elem->consumed_len; 290 int used = min(rem, *len); 291 292 elem->consumed_len += used; 293 *len -= used; 294 295 return elem->consumed_len == elem->total_len; 296 } 297 298 static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) 299 { 300 int rem_len = len, cons_len, ret = 0; 301 struct bpf_stream_elem *elem = NULL; 302 struct llist_node *node; 303 304 mutex_lock(&stream->lock); 305 306 while (rem_len) { 307 int pos = len - rem_len; 308 bool cont; 309 310 node = bpf_stream_backlog_peek(stream); 311 if (!node) { 312 bpf_stream_backlog_fill(stream); 313 node = bpf_stream_backlog_peek(stream); 314 } 315 if (!node) 316 break; 317 elem = container_of(node, typeof(*elem), node); 318 319 cons_len = elem->consumed_len; 320 cont = bpf_stream_consume_elem(elem, &rem_len) == false; 321 322 ret = copy_to_user(buf + pos, elem->str + cons_len, 323 elem->consumed_len - cons_len); 324 /* Restore in case of error. */ 325 if (ret) { 326 ret = -EFAULT; 327 elem->consumed_len = cons_len; 328 break; 329 } 330 331 if (cont) 332 continue; 333 bpf_stream_backlog_pop(stream); 334 bpf_stream_release_capacity(stream, elem); 335 bpf_stream_free_elem(elem); 336 } 337 338 mutex_unlock(&stream->lock); 339 return ret ? ret : len - rem_len; 340 } 341 342 int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) 343 { 344 struct bpf_stream *stream; 345 346 stream = bpf_stream_get(stream_id, prog->aux); 347 if (!stream) 348 return -ENOENT; 349 return bpf_stream_read(stream, buf, len); 350 } 351 352 __bpf_kfunc_start_defs(); 353 354 /* 355 * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the 356 * enum in headers. 357 */ 358 __bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) 359 { 360 struct bpf_bprintf_data data = { 361 .get_bin_args = true, 362 .get_buf = true, 363 }; 364 struct bpf_prog_aux *aux = aux__prog; 365 u32 fmt_size = strlen(fmt__str) + 1; 366 struct bpf_stream *stream; 367 u32 data_len = len__sz; 368 int ret, num_args; 369 370 stream = bpf_stream_get(stream_id, aux); 371 if (!stream) 372 return -ENOENT; 373 374 if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || 375 (data_len && !args)) 376 return -EINVAL; 377 num_args = data_len / 8; 378 379 ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); 380 if (ret < 0) 381 return ret; 382 383 ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); 384 /* Exclude NULL byte during push. */ 385 ret = bpf_stream_push_str(stream, data.buf, ret); 386 bpf_bprintf_cleanup(&data); 387 388 return ret; 389 } 390 391 __bpf_kfunc_end_defs(); 392 393 /* Added kfunc to common_btf_ids */ 394 395 void bpf_prog_stream_init(struct bpf_prog *prog) 396 { 397 int i; 398 399 for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { 400 atomic_set(&prog->aux->stream[i].capacity, 0); 401 init_llist_head(&prog->aux->stream[i].log); 402 mutex_init(&prog->aux->stream[i].lock); 403 prog->aux->stream[i].backlog_head = NULL; 404 prog->aux->stream[i].backlog_tail = NULL; 405 } 406 } 407 408 void bpf_prog_stream_free(struct bpf_prog *prog) 409 { 410 struct llist_node *list; 411 int i; 412 413 for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { 414 list = llist_del_all(&prog->aux->stream[i].log); 415 bpf_stream_free_list(list); 416 bpf_stream_free_list(prog->aux->stream[i].backlog_head); 417 } 418 } 419 420 void bpf_stream_stage_init(struct bpf_stream_stage *ss) 421 { 422 init_llist_head(&ss->log); 423 ss->len = 0; 424 } 425 426 void bpf_stream_stage_free(struct bpf_stream_stage *ss) 427 { 428 struct llist_node *node; 429 430 node = llist_del_all(&ss->log); 431 bpf_stream_free_list(node); 432 } 433 434 int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) 435 { 436 struct bpf_bprintf_buffers *buf; 437 va_list args; 438 int ret; 439 440 if (bpf_try_get_buffers(&buf)) 441 return -EBUSY; 442 443 va_start(args, fmt); 444 ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); 445 va_end(args); 446 ss->len += ret; 447 /* Exclude NULL byte during push. */ 448 ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); 449 bpf_put_buffers(); 450 return ret; 451 } 452 453 int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, 454 enum bpf_stream_id stream_id) 455 { 456 struct llist_node *list, *head, *tail; 457 struct bpf_stream *stream; 458 int ret; 459 460 stream = bpf_stream_get(stream_id, prog->aux); 461 if (!stream) 462 return -EINVAL; 463 464 ret = bpf_stream_consume_capacity(stream, ss->len); 465 if (ret) 466 return ret; 467 468 list = llist_del_all(&ss->log); 469 head = tail = list; 470 471 if (!list) 472 return 0; 473 while (llist_next(list)) { 474 tail = llist_next(list); 475 list = tail; 476 } 477 llist_add_batch(head, tail, &stream->log); 478 return 0; 479 } 480 481 struct dump_stack_ctx { 482 struct bpf_stream_stage *ss; 483 int err; 484 }; 485 486 static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) 487 { 488 struct dump_stack_ctx *ctxp = cookie; 489 const char *file = "", *line = ""; 490 struct bpf_prog *prog; 491 int num, ret; 492 493 rcu_read_lock(); 494 prog = bpf_prog_ksym_find(ip); 495 rcu_read_unlock(); 496 if (prog) { 497 ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num); 498 if (ret < 0) 499 goto end; 500 ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", 501 (void *)(long)ip, line, file, num); 502 return !ctxp->err; 503 } 504 end: 505 ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip); 506 return !ctxp->err; 507 } 508 509 int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) 510 { 511 struct dump_stack_ctx ctx = { .ss = ss }; 512 int ret; 513 514 ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", 515 raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), 516 current->pid, current->comm); 517 if (ret) 518 return ret; 519 ret = bpf_stream_stage_printk(ss, "Call trace:\n"); 520 if (ret) 521 return ret; 522 arch_bpf_stack_walk(dump_stack_cb, &ctx); 523 if (ctx.err) 524 return ctx.err; 525 return bpf_stream_stage_printk(ss, "\n"); 526 } 527