1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Performance events ring-buffer code: 4 * 5 * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 7 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra 8 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 9 */ 10 11 #include <linux/perf_event.h> 12 #include <linux/vmalloc.h> 13 #include <linux/slab.h> 14 #include <linux/circ_buf.h> 15 #include <linux/poll.h> 16 #include <linux/nospec.h> 17 18 #include "internal.h" 19 20 static void perf_output_wakeup(struct perf_output_handle *handle) 21 { 22 atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM); 23 24 handle->event->pending_wakeup = 1; 25 26 if (*perf_event_fasync(handle->event) && !handle->event->pending_kill) 27 handle->event->pending_kill = POLL_IN; 28 29 irq_work_queue(&handle->event->pending_irq); 30 } 31 32 /* 33 * We need to ensure a later event_id doesn't publish a head when a former 34 * event isn't done writing. However since we need to deal with NMIs we 35 * cannot fully serialize things. 36 * 37 * We only publish the head (and generate a wakeup) when the outer-most 38 * event completes. 39 */ 40 static void perf_output_get_handle(struct perf_output_handle *handle) 41 { 42 struct perf_buffer *rb = handle->rb; 43 44 preempt_disable(); 45 46 /* 47 * Avoid an explicit LOAD/STORE such that architectures with memops 48 * can use them. 49 */ 50 (*(volatile unsigned int *)&rb->nest)++; 51 handle->wakeup = local_read(&rb->wakeup); 52 } 53 54 static void perf_output_put_handle(struct perf_output_handle *handle) 55 { 56 struct perf_buffer *rb = handle->rb; 57 unsigned long head; 58 unsigned int nest; 59 60 /* 61 * If this isn't the outermost nesting, we don't have to update 62 * @rb->user_page->data_head. 63 */ 64 nest = READ_ONCE(rb->nest); 65 if (nest > 1) { 66 WRITE_ONCE(rb->nest, nest - 1); 67 goto out; 68 } 69 70 again: 71 /* 72 * In order to avoid publishing a head value that goes backwards, 73 * we must ensure the load of @rb->head happens after we've 74 * incremented @rb->nest. 75 * 76 * Otherwise we can observe a @rb->head value before one published 77 * by an IRQ/NMI happening between the load and the increment. 78 */ 79 barrier(); 80 head = local_read(&rb->head); 81 82 /* 83 * IRQ/NMI can happen here and advance @rb->head, causing our 84 * load above to be stale. 85 */ 86 87 /* 88 * Since the mmap() consumer (userspace) can run on a different CPU: 89 * 90 * kernel user 91 * 92 * if (LOAD ->data_tail) { LOAD ->data_head 93 * (A) smp_rmb() (C) 94 * STORE $data LOAD $data 95 * smp_wmb() (B) smp_mb() (D) 96 * STORE ->data_head STORE ->data_tail 97 * } 98 * 99 * Where A pairs with D, and B pairs with C. 100 * 101 * In our case (A) is a control dependency that separates the load of 102 * the ->data_tail and the stores of $data. In case ->data_tail 103 * indicates there is no room in the buffer to store $data we do not. 104 * 105 * D needs to be a full barrier since it separates the data READ 106 * from the tail WRITE. 107 * 108 * For B a WMB is sufficient since it separates two WRITEs, and for C 109 * an RMB is sufficient since it separates two READs. 110 * 111 * See perf_output_begin(). 112 */ 113 smp_wmb(); /* B, matches C */ 114 WRITE_ONCE(rb->user_page->data_head, head); 115 116 /* 117 * We must publish the head before decrementing the nest count, 118 * otherwise an IRQ/NMI can publish a more recent head value and our 119 * write will (temporarily) publish a stale value. 120 */ 121 barrier(); 122 WRITE_ONCE(rb->nest, 0); 123 124 /* 125 * Ensure we decrement @rb->nest before we validate the @rb->head. 126 * Otherwise we cannot be sure we caught the 'last' nested update. 127 */ 128 barrier(); 129 if (unlikely(head != local_read(&rb->head))) { 130 WRITE_ONCE(rb->nest, 1); 131 goto again; 132 } 133 134 if (handle->wakeup != local_read(&rb->wakeup)) 135 perf_output_wakeup(handle); 136 137 out: 138 preempt_enable(); 139 } 140 141 static __always_inline bool 142 ring_buffer_has_space(unsigned long head, unsigned long tail, 143 unsigned long data_size, unsigned int size, 144 bool backward) 145 { 146 if (!backward) 147 return CIRC_SPACE(head, tail, data_size) >= size; 148 else 149 return CIRC_SPACE(tail, head, data_size) >= size; 150 } 151 152 static __always_inline int 153 __perf_output_begin(struct perf_output_handle *handle, 154 struct perf_sample_data *data, 155 struct perf_event *event, unsigned int size, 156 bool backward) 157 { 158 struct perf_buffer *rb; 159 unsigned long tail, offset, head; 160 int have_lost, page_shift; 161 struct { 162 struct perf_event_header header; 163 u64 id; 164 u64 lost; 165 } lost_event; 166 167 rcu_read_lock(); 168 /* 169 * For inherited events we send all the output towards the parent. 170 */ 171 if (event->parent) 172 event = event->parent; 173 174 rb = rcu_dereference(event->rb); 175 if (unlikely(!rb)) 176 goto out; 177 178 if (unlikely(rb->paused)) { 179 if (rb->nr_pages) { 180 local_inc(&rb->lost); 181 atomic64_inc(&event->lost_samples); 182 } 183 goto out; 184 } 185 186 handle->rb = rb; 187 handle->event = event; 188 handle->flags = 0; 189 190 have_lost = local_read(&rb->lost); 191 if (unlikely(have_lost)) { 192 size += sizeof(lost_event); 193 if (event->attr.sample_id_all) 194 size += event->id_header_size; 195 } 196 197 perf_output_get_handle(handle); 198 199 offset = local_read(&rb->head); 200 do { 201 head = offset; 202 tail = READ_ONCE(rb->user_page->data_tail); 203 if (!rb->overwrite) { 204 if (unlikely(!ring_buffer_has_space(head, tail, 205 perf_data_size(rb), 206 size, backward))) 207 goto fail; 208 } 209 210 /* 211 * The above forms a control dependency barrier separating the 212 * @tail load above from the data stores below. Since the @tail 213 * load is required to compute the branch to fail below. 214 * 215 * A, matches D; the full memory barrier userspace SHOULD issue 216 * after reading the data and before storing the new tail 217 * position. 218 * 219 * See perf_output_put_handle(). 220 */ 221 222 if (!backward) 223 head += size; 224 else 225 head -= size; 226 } while (!local_try_cmpxchg(&rb->head, &offset, head)); 227 228 if (backward) { 229 offset = head; 230 head = (u64)(-head); 231 } 232 233 /* 234 * We rely on the implied barrier() by local_cmpxchg() to ensure 235 * none of the data stores below can be lifted up by the compiler. 236 */ 237 238 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) 239 local_add(rb->watermark, &rb->wakeup); 240 241 page_shift = PAGE_SHIFT + page_order(rb); 242 243 handle->page = (offset >> page_shift) & (rb->nr_pages - 1); 244 offset &= (1UL << page_shift) - 1; 245 handle->addr = rb->data_pages[handle->page] + offset; 246 handle->size = (1UL << page_shift) - offset; 247 248 if (unlikely(have_lost)) { 249 lost_event.header.size = sizeof(lost_event); 250 lost_event.header.type = PERF_RECORD_LOST; 251 lost_event.header.misc = 0; 252 lost_event.id = event->id; 253 lost_event.lost = local_xchg(&rb->lost, 0); 254 255 /* XXX mostly redundant; @data is already fully initializes */ 256 perf_event_header__init_id(&lost_event.header, data, event); 257 perf_output_put(handle, lost_event); 258 perf_event__output_id_sample(event, handle, data); 259 } 260 261 return 0; 262 263 fail: 264 local_inc(&rb->lost); 265 atomic64_inc(&event->lost_samples); 266 perf_output_put_handle(handle); 267 out: 268 rcu_read_unlock(); 269 270 return -ENOSPC; 271 } 272 273 int perf_output_begin_forward(struct perf_output_handle *handle, 274 struct perf_sample_data *data, 275 struct perf_event *event, unsigned int size) 276 { 277 return __perf_output_begin(handle, data, event, size, false); 278 } 279 280 int perf_output_begin_backward(struct perf_output_handle *handle, 281 struct perf_sample_data *data, 282 struct perf_event *event, unsigned int size) 283 { 284 return __perf_output_begin(handle, data, event, size, true); 285 } 286 287 int perf_output_begin(struct perf_output_handle *handle, 288 struct perf_sample_data *data, 289 struct perf_event *event, unsigned int size) 290 { 291 292 return __perf_output_begin(handle, data, event, size, 293 unlikely(is_write_backward(event))); 294 } 295 296 unsigned int perf_output_copy(struct perf_output_handle *handle, 297 const void *buf, unsigned int len) 298 { 299 return __output_copy(handle, buf, len); 300 } 301 302 unsigned int perf_output_skip(struct perf_output_handle *handle, 303 unsigned int len) 304 { 305 return __output_skip(handle, NULL, len); 306 } 307 308 void perf_output_end(struct perf_output_handle *handle) 309 { 310 perf_output_put_handle(handle); 311 rcu_read_unlock(); 312 } 313 314 static void 315 ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) 316 { 317 long max_size = perf_data_size(rb); 318 319 if (watermark) 320 rb->watermark = min(max_size, watermark); 321 322 if (!rb->watermark) 323 rb->watermark = max_size / 2; 324 325 if (flags & RING_BUFFER_WRITABLE) 326 rb->overwrite = 0; 327 else 328 rb->overwrite = 1; 329 330 refcount_set(&rb->refcount, 1); 331 332 INIT_LIST_HEAD(&rb->event_list); 333 spin_lock_init(&rb->event_lock); 334 335 /* 336 * perf_output_begin() only checks rb->paused, therefore 337 * rb->paused must be true if we have no pages for output. 338 */ 339 if (!rb->nr_pages) 340 rb->paused = 1; 341 342 mutex_init(&rb->aux_mutex); 343 rb->mmap_user = get_current_user(); 344 refcount_set(&rb->mmap_count, 1); 345 } 346 347 void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) 348 { 349 /* 350 * OVERWRITE is determined by perf_aux_output_end() and can't 351 * be passed in directly. 352 */ 353 if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) 354 return; 355 356 handle->aux_flags |= flags; 357 } 358 EXPORT_SYMBOL_GPL(perf_aux_output_flag); 359 360 /* 361 * This is called before hardware starts writing to the AUX area to 362 * obtain an output handle and make sure there's room in the buffer. 363 * When the capture completes, call perf_aux_output_end() to commit 364 * the recorded data to the buffer. 365 * 366 * The ordering is similar to that of perf_output_{begin,end}, with 367 * the exception of (B), which should be taken care of by the pmu 368 * driver, since ordering rules will differ depending on hardware. 369 * 370 * Call this from pmu::start(); see the comment in perf_aux_output_end() 371 * about its use in pmu callbacks. Both can also be called from the PMI 372 * handler if needed. 373 */ 374 void *perf_aux_output_begin(struct perf_output_handle *handle, 375 struct perf_event *event) 376 { 377 struct perf_event *output_event = event; 378 unsigned long aux_head, aux_tail; 379 struct perf_buffer *rb; 380 unsigned int nest; 381 382 if (output_event->parent) 383 output_event = output_event->parent; 384 385 /* 386 * Since this will typically be open across pmu::add/pmu::del, we 387 * grab ring_buffer's refcount instead of holding rcu read lock 388 * to make sure it doesn't disappear under us. 389 */ 390 rb = ring_buffer_get(output_event); 391 if (!rb) 392 return NULL; 393 394 if (!rb_has_aux(rb)) 395 goto err; 396 397 /* 398 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), 399 * about to get freed, so we leave immediately. 400 * 401 * Checking rb::aux_mmap_count and rb::refcount has to be done in 402 * the same order, see perf_mmap_close. Otherwise we end up freeing 403 * aux pages in this path, which is a bug, because in_atomic(). 404 */ 405 if (!refcount_read(&rb->aux_mmap_count)) 406 goto err; 407 408 if (!refcount_inc_not_zero(&rb->aux_refcount)) 409 goto err; 410 411 nest = READ_ONCE(rb->aux_nest); 412 /* 413 * Nesting is not supported for AUX area, make sure nested 414 * writers are caught early 415 */ 416 if (WARN_ON_ONCE(nest)) 417 goto err_put; 418 419 WRITE_ONCE(rb->aux_nest, nest + 1); 420 421 aux_head = rb->aux_head; 422 423 handle->rb = rb; 424 handle->event = event; 425 handle->head = aux_head; 426 handle->size = 0; 427 handle->aux_flags = 0; 428 429 /* 430 * In overwrite mode, AUX data stores do not depend on aux_tail, 431 * therefore (A) control dependency barrier does not exist. The 432 * (B) <-> (C) ordering is still observed by the pmu driver. 433 */ 434 if (!rb->aux_overwrite) { 435 aux_tail = READ_ONCE(rb->user_page->aux_tail); 436 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 437 if (aux_head - aux_tail < perf_aux_size(rb)) 438 handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); 439 440 /* 441 * handle->size computation depends on aux_tail load; this forms a 442 * control dependency barrier separating aux_tail load from aux data 443 * store that will be enabled on successful return 444 */ 445 if (!handle->size) { /* A, matches D */ 446 perf_event_disable_inatomic(handle->event); 447 perf_output_wakeup(handle); 448 WRITE_ONCE(rb->aux_nest, 0); 449 goto err_put; 450 } 451 } 452 453 return handle->rb->aux_priv; 454 455 err_put: 456 /* can't be last */ 457 rb_free_aux(rb); 458 459 err: 460 ring_buffer_put(rb); 461 handle->event = NULL; 462 463 return NULL; 464 } 465 EXPORT_SYMBOL_GPL(perf_aux_output_begin); 466 467 static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb) 468 { 469 if (rb->aux_overwrite) 470 return false; 471 472 if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 473 rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); 474 return true; 475 } 476 477 return false; 478 } 479 480 /* 481 * Commit the data written by hardware into the ring buffer by adjusting 482 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the 483 * pmu driver's responsibility to observe ordering rules of the hardware, 484 * so that all the data is externally visible before this is called. 485 * 486 * Note: this has to be called from pmu::stop() callback, as the assumption 487 * of the AUX buffer management code is that after pmu::stop(), the AUX 488 * transaction must be stopped and therefore drop the AUX reference count. 489 */ 490 void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) 491 { 492 bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); 493 struct perf_buffer *rb = handle->rb; 494 unsigned long aux_head; 495 496 /* in overwrite mode, driver provides aux_head via handle */ 497 if (rb->aux_overwrite) { 498 handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; 499 500 aux_head = handle->head; 501 rb->aux_head = aux_head; 502 } else { 503 handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; 504 505 aux_head = rb->aux_head; 506 rb->aux_head += size; 507 } 508 509 /* 510 * Only send RECORD_AUX if we have something useful to communicate 511 * 512 * Note: the OVERWRITE records by themselves are not considered 513 * useful, as they don't communicate any *new* information, 514 * aside from the short-lived offset, that becomes history at 515 * the next event sched-in and therefore isn't useful. 516 * The userspace that needs to copy out AUX data in overwrite 517 * mode should know to use user_page::aux_head for the actual 518 * offset. So, from now on we don't output AUX records that 519 * have *only* OVERWRITE flag set. 520 */ 521 if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) 522 perf_event_aux_event(handle->event, aux_head, size, 523 handle->aux_flags); 524 525 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); 526 if (rb_need_aux_wakeup(rb)) 527 wakeup = true; 528 529 if (wakeup) { 530 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 531 perf_event_disable_inatomic(handle->event); 532 perf_output_wakeup(handle); 533 } 534 535 handle->event = NULL; 536 537 WRITE_ONCE(rb->aux_nest, 0); 538 /* can't be last */ 539 rb_free_aux(rb); 540 ring_buffer_put(rb); 541 } 542 EXPORT_SYMBOL_GPL(perf_aux_output_end); 543 544 /* 545 * Skip over a given number of bytes in the AUX buffer, due to, for example, 546 * hardware's alignment constraints. 547 */ 548 int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) 549 { 550 struct perf_buffer *rb = handle->rb; 551 552 if (size > handle->size) 553 return -ENOSPC; 554 555 rb->aux_head += size; 556 557 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); 558 if (rb_need_aux_wakeup(rb)) { 559 perf_output_wakeup(handle); 560 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 561 } 562 563 handle->head = rb->aux_head; 564 handle->size -= size; 565 566 return 0; 567 } 568 EXPORT_SYMBOL_GPL(perf_aux_output_skip); 569 570 void *perf_get_aux(struct perf_output_handle *handle) 571 { 572 /* this is only valid between perf_aux_output_begin and *_end */ 573 if (!handle->event) 574 return NULL; 575 576 return handle->rb->aux_priv; 577 } 578 EXPORT_SYMBOL_GPL(perf_get_aux); 579 580 /* 581 * Copy out AUX data from an AUX handle. 582 */ 583 long perf_output_copy_aux(struct perf_output_handle *aux_handle, 584 struct perf_output_handle *handle, 585 unsigned long from, unsigned long to) 586 { 587 struct perf_buffer *rb = aux_handle->rb; 588 unsigned long tocopy, remainder, len = 0; 589 void *addr; 590 591 from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; 592 to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; 593 594 do { 595 tocopy = PAGE_SIZE - offset_in_page(from); 596 if (to > from) 597 tocopy = min(tocopy, to - from); 598 if (!tocopy) 599 break; 600 601 addr = rb->aux_pages[from >> PAGE_SHIFT]; 602 addr += offset_in_page(from); 603 604 remainder = perf_output_copy(handle, addr, tocopy); 605 if (remainder) 606 return -EFAULT; 607 608 len += tocopy; 609 from += tocopy; 610 from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; 611 } while (to != from); 612 613 return len; 614 } 615 616 #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) 617 618 static struct page *rb_alloc_aux_page(int node, int order) 619 { 620 struct page *page; 621 622 if (order > MAX_PAGE_ORDER) 623 order = MAX_PAGE_ORDER; 624 625 do { 626 page = alloc_pages_node(node, PERF_AUX_GFP, order); 627 } while (!page && order--); 628 629 if (page && order) { 630 /* 631 * Communicate the allocation size to the driver: 632 * if we managed to secure a high-order allocation, 633 * set its first page's private to this order; 634 * !PagePrivate(page) means it's just a normal page. 635 */ 636 split_page(page, order); 637 SetPagePrivate(page); 638 set_page_private(page, order); 639 } 640 641 return page; 642 } 643 644 static void rb_free_aux_page(struct perf_buffer *rb, int idx) 645 { 646 struct page *page = virt_to_page(rb->aux_pages[idx]); 647 648 ClearPagePrivate(page); 649 __free_page(page); 650 } 651 652 static void __rb_free_aux(struct perf_buffer *rb) 653 { 654 int pg; 655 656 /* 657 * Should never happen, the last reference should be dropped from 658 * perf_mmap_close() path, which first stops aux transactions (which 659 * in turn are the atomic holders of aux_refcount) and then does the 660 * last rb_free_aux(). 661 */ 662 WARN_ON_ONCE(in_atomic()); 663 664 if (rb->aux_priv) { 665 rb->free_aux(rb->aux_priv); 666 rb->free_aux = NULL; 667 rb->aux_priv = NULL; 668 } 669 670 if (rb->aux_nr_pages) { 671 for (pg = 0; pg < rb->aux_nr_pages; pg++) 672 rb_free_aux_page(rb, pg); 673 674 kfree(rb->aux_pages); 675 rb->aux_nr_pages = 0; 676 } 677 } 678 679 int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, 680 pgoff_t pgoff, int nr_pages, long watermark, int flags) 681 { 682 bool overwrite = !(flags & RING_BUFFER_WRITABLE); 683 int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); 684 bool use_contiguous_pages = event->pmu->capabilities & ( 685 PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); 686 /* 687 * Initialize max_order to 0 for page allocation. This allocates single 688 * pages to minimize memory fragmentation. This is overridden if the 689 * PMU needs or prefers contiguous pages (use_contiguous_pages = true). 690 */ 691 int max_order = 0; 692 int ret = -ENOMEM; 693 694 if (!has_aux(event)) 695 return -EOPNOTSUPP; 696 697 if (nr_pages <= 0) 698 return -EINVAL; 699 700 if (!overwrite) { 701 /* 702 * Watermark defaults to half the buffer, to aid PMU drivers 703 * in double buffering. 704 */ 705 if (!watermark) 706 watermark = min_t(unsigned long, 707 U32_MAX, 708 (unsigned long)nr_pages << (PAGE_SHIFT - 1)); 709 710 /* 711 * If using contiguous pages, use aux_watermark as the basis 712 * for chunking to help PMU drivers honor the watermark. 713 */ 714 if (use_contiguous_pages) 715 max_order = get_order(watermark); 716 } else { 717 /* 718 * If using contiguous pages, we need to start with the 719 * max_order that fits in nr_pages, not the other way around, 720 * hence ilog2() and not get_order. 721 */ 722 if (use_contiguous_pages) 723 max_order = ilog2(nr_pages); 724 watermark = 0; 725 } 726 727 /* 728 * kcalloc_node() is unable to allocate buffer if the size is larger 729 * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case. 730 */ 731 if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER) 732 return -ENOMEM; 733 rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, 734 node); 735 if (!rb->aux_pages) 736 return -ENOMEM; 737 738 rb->free_aux = event->pmu->free_aux; 739 for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { 740 struct page *page; 741 int last, order; 742 743 order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); 744 page = rb_alloc_aux_page(node, order); 745 if (!page) 746 goto out; 747 748 for (last = rb->aux_nr_pages + (1 << page_private(page)); 749 last > rb->aux_nr_pages; rb->aux_nr_pages++) 750 rb->aux_pages[rb->aux_nr_pages] = page_address(page++); 751 } 752 753 /* 754 * In overwrite mode, PMUs that don't support SG may not handle more 755 * than one contiguous allocation, since they rely on PMI to do double 756 * buffering. In this case, the entire buffer has to be one contiguous 757 * chunk. 758 */ 759 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && 760 overwrite) { 761 struct page *page = virt_to_page(rb->aux_pages[0]); 762 763 if (page_private(page) != max_order) 764 goto out; 765 } 766 767 rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, 768 overwrite); 769 if (!rb->aux_priv) 770 goto out; 771 772 ret = 0; 773 774 /* 775 * aux_pages (and pmu driver's private data, aux_priv) will be 776 * referenced in both producer's and consumer's contexts, thus 777 * we keep a refcount here to make sure either of the two can 778 * reference them safely. 779 */ 780 refcount_set(&rb->aux_refcount, 1); 781 782 rb->aux_overwrite = overwrite; 783 rb->aux_watermark = watermark; 784 785 out: 786 if (!ret) 787 rb->aux_pgoff = pgoff; 788 else 789 __rb_free_aux(rb); 790 791 return ret; 792 } 793 794 void rb_free_aux(struct perf_buffer *rb) 795 { 796 if (refcount_dec_and_test(&rb->aux_refcount)) 797 __rb_free_aux(rb); 798 } 799 800 #ifndef CONFIG_PERF_USE_VMALLOC 801 802 /* 803 * Back perf_mmap() with regular GFP_KERNEL-0 pages. 804 */ 805 806 static struct page * 807 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) 808 { 809 if (pgoff > rb->nr_pages) 810 return NULL; 811 812 if (pgoff == 0) 813 return virt_to_page(rb->user_page); 814 815 return virt_to_page(rb->data_pages[pgoff - 1]); 816 } 817 818 static void *perf_mmap_alloc_page(int cpu) 819 { 820 struct page *page; 821 int node; 822 823 node = (cpu == -1) ? cpu : cpu_to_node(cpu); 824 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 825 if (!page) 826 return NULL; 827 828 return page_address(page); 829 } 830 831 static void perf_mmap_free_page(void *addr) 832 { 833 struct page *page = virt_to_page(addr); 834 835 __free_page(page); 836 } 837 838 struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) 839 { 840 struct perf_buffer *rb; 841 unsigned long size; 842 int i, node; 843 844 size = sizeof(struct perf_buffer); 845 size += nr_pages * sizeof(void *); 846 847 if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER) 848 goto fail; 849 850 node = (cpu == -1) ? cpu : cpu_to_node(cpu); 851 rb = kzalloc_node(size, GFP_KERNEL, node); 852 if (!rb) 853 goto fail; 854 855 rb->user_page = perf_mmap_alloc_page(cpu); 856 if (!rb->user_page) 857 goto fail_user_page; 858 859 for (i = 0; i < nr_pages; i++) { 860 rb->data_pages[i] = perf_mmap_alloc_page(cpu); 861 if (!rb->data_pages[i]) 862 goto fail_data_pages; 863 } 864 865 rb->nr_pages = nr_pages; 866 867 ring_buffer_init(rb, watermark, flags); 868 869 return rb; 870 871 fail_data_pages: 872 for (i--; i >= 0; i--) 873 perf_mmap_free_page(rb->data_pages[i]); 874 875 perf_mmap_free_page(rb->user_page); 876 877 fail_user_page: 878 kfree(rb); 879 880 fail: 881 return NULL; 882 } 883 884 void rb_free(struct perf_buffer *rb) 885 { 886 int i; 887 888 perf_mmap_free_page(rb->user_page); 889 for (i = 0; i < rb->nr_pages; i++) 890 perf_mmap_free_page(rb->data_pages[i]); 891 kfree(rb); 892 } 893 894 #else 895 static struct page * 896 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) 897 { 898 /* The '>' counts in the user page. */ 899 if (pgoff > data_page_nr(rb)) 900 return NULL; 901 902 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); 903 } 904 905 static void rb_free_work(struct work_struct *work) 906 { 907 struct perf_buffer *rb; 908 909 rb = container_of(work, struct perf_buffer, work); 910 911 vfree(rb->user_page); 912 kfree(rb); 913 } 914 915 void rb_free(struct perf_buffer *rb) 916 { 917 schedule_work(&rb->work); 918 } 919 920 struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) 921 { 922 struct perf_buffer *rb; 923 unsigned long size; 924 void *all_buf; 925 int node; 926 927 size = sizeof(struct perf_buffer); 928 size += sizeof(void *); 929 930 node = (cpu == -1) ? cpu : cpu_to_node(cpu); 931 rb = kzalloc_node(size, GFP_KERNEL, node); 932 if (!rb) 933 goto fail; 934 935 INIT_WORK(&rb->work, rb_free_work); 936 937 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); 938 if (!all_buf) 939 goto fail_all_buf; 940 941 rb->user_page = all_buf; 942 rb->data_pages[0] = all_buf + PAGE_SIZE; 943 if (nr_pages) { 944 rb->nr_pages = 1; 945 rb->page_order = ilog2(nr_pages); 946 } 947 948 ring_buffer_init(rb, watermark, flags); 949 950 return rb; 951 952 fail_all_buf: 953 kfree(rb); 954 955 fail: 956 return NULL; 957 } 958 959 #endif 960 961 struct page * 962 perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) 963 { 964 if (rb->aux_nr_pages) { 965 /* above AUX space */ 966 if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) 967 return NULL; 968 969 /* AUX space */ 970 if (pgoff >= rb->aux_pgoff) { 971 int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); 972 return virt_to_page(rb->aux_pages[aux_pgoff]); 973 } 974 } 975 976 return __perf_mmap_to_page(rb, pgoff); 977 } 978