/*
 * Generic ring buffer
 *
 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
 */
#include <linux/ring_buffer.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
#include <linux/sched.h>	/* used for sched_clock() (for now) */
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/fs.h>

#include "trace.h"

/*
 * Global flag to disable all recording to ring buffers.
 *
 * It is tested first thing in ring_buffer_lock_reserve() and
 * ring_buffer_write(); while it is non-zero both refuse to record.
 */
static int ring_buffers_off __read_mostly;

/**
 * tracing_on - enable all tracing buffers
 *
 * This function enables all tracing buffers that may have been
 * disabled with tracing_off. It only clears the global flag; it does
 * not touch the per-buffer or per-cpu record_disabled counters.
 */
void tracing_on(void)
{
	ring_buffers_off = 0;
}

/**
 * tracing_off - turn off all tracing buffers
 *
 * This function stops all tracing buffers from recording data.
 * It does not disable any overhead the tracers themselves may
 * be causing. This function simply causes all recording to
 * the ring buffers to fail.
 */
void tracing_off(void)
{
	ring_buffers_off = 1;
}

/*
 * Up this if you want to test the TIME_EXTENTS and normalization.
 * The time stamp is shifted left by this amount when it is taken and
 * shifted right again when it is normalized.
 */
#define DEBUG_SHIFT 0

/* FIXME!!!
*/ 52 u64 ring_buffer_time_stamp(int cpu) 53 { 54 u64 time; 55 56 preempt_disable_notrace(); 57 /* shift to debug/test normalization and TIME_EXTENTS */ 58 time = sched_clock() << DEBUG_SHIFT; 59 preempt_enable_notrace(); 60 61 return time; 62 } 63 64 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 65 { 66 /* Just stupid testing the normalize function and deltas */ 67 *ts >>= DEBUG_SHIFT; 68 } 69 70 #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 71 #define RB_ALIGNMENT_SHIFT 2 72 #define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) 73 #define RB_MAX_SMALL_DATA 28 74 75 enum { 76 RB_LEN_TIME_EXTEND = 8, 77 RB_LEN_TIME_STAMP = 16, 78 }; 79 80 /* inline for ring buffer fast paths */ 81 static inline unsigned 82 rb_event_length(struct ring_buffer_event *event) 83 { 84 unsigned length; 85 86 switch (event->type) { 87 case RINGBUF_TYPE_PADDING: 88 /* undefined */ 89 return -1; 90 91 case RINGBUF_TYPE_TIME_EXTEND: 92 return RB_LEN_TIME_EXTEND; 93 94 case RINGBUF_TYPE_TIME_STAMP: 95 return RB_LEN_TIME_STAMP; 96 97 case RINGBUF_TYPE_DATA: 98 if (event->len) 99 length = event->len << RB_ALIGNMENT_SHIFT; 100 else 101 length = event->array[0]; 102 return length + RB_EVNT_HDR_SIZE; 103 default: 104 BUG(); 105 } 106 /* not hit */ 107 return 0; 108 } 109 110 /** 111 * ring_buffer_event_length - return the length of the event 112 * @event: the event to get the length of 113 */ 114 unsigned ring_buffer_event_length(struct ring_buffer_event *event) 115 { 116 return rb_event_length(event); 117 } 118 119 /* inline for ring buffer fast paths */ 120 static inline void * 121 rb_event_data(struct ring_buffer_event *event) 122 { 123 BUG_ON(event->type != RINGBUF_TYPE_DATA); 124 /* If length is in len field, then array[0] has the data */ 125 if (event->len) 126 return (void *)&event->array[0]; 127 /* Otherwise length is in array[0] and array[1] has the data */ 128 return (void *)&event->array[1]; 129 } 130 131 /** 132 * ring_buffer_event_data - return the data of the event 
133 * @event: the event to get the data from 134 */ 135 void *ring_buffer_event_data(struct ring_buffer_event *event) 136 { 137 return rb_event_data(event); 138 } 139 140 #define for_each_buffer_cpu(buffer, cpu) \ 141 for_each_cpu_mask(cpu, buffer->cpumask) 142 143 #define TS_SHIFT 27 144 #define TS_MASK ((1ULL << TS_SHIFT) - 1) 145 #define TS_DELTA_TEST (~TS_MASK) 146 147 /* 148 * This hack stolen from mm/slob.c. 149 * We can store per page timing information in the page frame of the page. 150 * Thanks to Peter Zijlstra for suggesting this idea. 151 */ 152 struct buffer_page { 153 u64 time_stamp; /* page time stamp */ 154 local_t write; /* index for next write */ 155 local_t commit; /* write commited index */ 156 unsigned read; /* index for next read */ 157 struct list_head list; /* list of free pages */ 158 void *page; /* Actual data page */ 159 }; 160 161 /* 162 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 163 * this issue out. 164 */ 165 static inline void free_buffer_page(struct buffer_page *bpage) 166 { 167 if (bpage->page) 168 free_page((unsigned long)bpage->page); 169 kfree(bpage); 170 } 171 172 /* 173 * We need to fit the time_stamp delta into 27 bits. 174 */ 175 static inline int test_time_stamp(u64 delta) 176 { 177 if (delta & TS_DELTA_TEST) 178 return 1; 179 return 0; 180 } 181 182 #define BUF_PAGE_SIZE PAGE_SIZE 183 184 /* 185 * head_page == tail_page && head == tail then buffer is empty. 
186 */ 187 struct ring_buffer_per_cpu { 188 int cpu; 189 struct ring_buffer *buffer; 190 spinlock_t lock; 191 struct lock_class_key lock_key; 192 struct list_head pages; 193 struct buffer_page *head_page; /* read from head */ 194 struct buffer_page *tail_page; /* write to tail */ 195 struct buffer_page *commit_page; /* commited pages */ 196 struct buffer_page *reader_page; 197 unsigned long overrun; 198 unsigned long entries; 199 u64 write_stamp; 200 u64 read_stamp; 201 atomic_t record_disabled; 202 }; 203 204 struct ring_buffer { 205 unsigned long size; 206 unsigned pages; 207 unsigned flags; 208 int cpus; 209 cpumask_t cpumask; 210 atomic_t record_disabled; 211 212 struct mutex mutex; 213 214 struct ring_buffer_per_cpu **buffers; 215 }; 216 217 struct ring_buffer_iter { 218 struct ring_buffer_per_cpu *cpu_buffer; 219 unsigned long head; 220 struct buffer_page *head_page; 221 u64 read_stamp; 222 }; 223 224 #define RB_WARN_ON(buffer, cond) \ 225 do { \ 226 if (unlikely(cond)) { \ 227 atomic_inc(&buffer->record_disabled); \ 228 WARN_ON(1); \ 229 } \ 230 } while (0) 231 232 #define RB_WARN_ON_RET(buffer, cond) \ 233 do { \ 234 if (unlikely(cond)) { \ 235 atomic_inc(&buffer->record_disabled); \ 236 WARN_ON(1); \ 237 return -1; \ 238 } \ 239 } while (0) 240 241 #define RB_WARN_ON_ONCE(buffer, cond) \ 242 do { \ 243 static int once; \ 244 if (unlikely(cond) && !once) { \ 245 once++; \ 246 atomic_inc(&buffer->record_disabled); \ 247 WARN_ON(1); \ 248 } \ 249 } while (0) 250 251 /** 252 * check_pages - integrity check of buffer pages 253 * @cpu_buffer: CPU buffer with pages to test 254 * 255 * As a safty measure we check to make sure the data pages have not 256 * been corrupted. 
257 */ 258 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 259 { 260 struct list_head *head = &cpu_buffer->pages; 261 struct buffer_page *page, *tmp; 262 263 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); 264 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); 265 266 list_for_each_entry_safe(page, tmp, head, list) { 267 RB_WARN_ON_RET(cpu_buffer, 268 page->list.next->prev != &page->list); 269 RB_WARN_ON_RET(cpu_buffer, 270 page->list.prev->next != &page->list); 271 } 272 273 return 0; 274 } 275 276 static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 277 unsigned nr_pages) 278 { 279 struct list_head *head = &cpu_buffer->pages; 280 struct buffer_page *page, *tmp; 281 unsigned long addr; 282 LIST_HEAD(pages); 283 unsigned i; 284 285 for (i = 0; i < nr_pages; i++) { 286 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 287 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 288 if (!page) 289 goto free_pages; 290 list_add(&page->list, &pages); 291 292 addr = __get_free_page(GFP_KERNEL); 293 if (!addr) 294 goto free_pages; 295 page->page = (void *)addr; 296 } 297 298 list_splice(&pages, head); 299 300 rb_check_pages(cpu_buffer); 301 302 return 0; 303 304 free_pages: 305 list_for_each_entry_safe(page, tmp, &pages, list) { 306 list_del_init(&page->list); 307 free_buffer_page(page); 308 } 309 return -ENOMEM; 310 } 311 312 static struct ring_buffer_per_cpu * 313 rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 314 { 315 struct ring_buffer_per_cpu *cpu_buffer; 316 struct buffer_page *page; 317 unsigned long addr; 318 int ret; 319 320 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 321 GFP_KERNEL, cpu_to_node(cpu)); 322 if (!cpu_buffer) 323 return NULL; 324 325 cpu_buffer->cpu = cpu; 326 cpu_buffer->buffer = buffer; 327 spin_lock_init(&cpu_buffer->lock); 328 INIT_LIST_HEAD(&cpu_buffer->pages); 329 330 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 331 GFP_KERNEL, 
cpu_to_node(cpu)); 332 if (!page) 333 goto fail_free_buffer; 334 335 cpu_buffer->reader_page = page; 336 addr = __get_free_page(GFP_KERNEL); 337 if (!addr) 338 goto fail_free_reader; 339 page->page = (void *)addr; 340 341 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 342 343 ret = rb_allocate_pages(cpu_buffer, buffer->pages); 344 if (ret < 0) 345 goto fail_free_reader; 346 347 cpu_buffer->head_page 348 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 349 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 350 351 return cpu_buffer; 352 353 fail_free_reader: 354 free_buffer_page(cpu_buffer->reader_page); 355 356 fail_free_buffer: 357 kfree(cpu_buffer); 358 return NULL; 359 } 360 361 static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 362 { 363 struct list_head *head = &cpu_buffer->pages; 364 struct buffer_page *page, *tmp; 365 366 list_del_init(&cpu_buffer->reader_page->list); 367 free_buffer_page(cpu_buffer->reader_page); 368 369 list_for_each_entry_safe(page, tmp, head, list) { 370 list_del_init(&page->list); 371 free_buffer_page(page); 372 } 373 kfree(cpu_buffer); 374 } 375 376 /* 377 * Causes compile errors if the struct buffer_page gets bigger 378 * than the struct page. 379 */ 380 extern int ring_buffer_page_too_big(void); 381 382 /** 383 * ring_buffer_alloc - allocate a new ring_buffer 384 * @size: the size in bytes that is needed. 385 * @flags: attributes to set for the ring buffer. 386 * 387 * Currently the only flag that is available is the RB_FL_OVERWRITE 388 * flag. This flag means that the buffer will overwrite old data 389 * when the buffer wraps. If this flag is not set, the buffer will 390 * drop data when the tail hits the head. 391 */ 392 struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) 393 { 394 struct ring_buffer *buffer; 395 int bsize; 396 int cpu; 397 398 /* Paranoid! 
Optimizes out when all is well */ 399 if (sizeof(struct buffer_page) > sizeof(struct page)) 400 ring_buffer_page_too_big(); 401 402 403 /* keep it in its own cache line */ 404 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 405 GFP_KERNEL); 406 if (!buffer) 407 return NULL; 408 409 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 410 buffer->flags = flags; 411 412 /* need at least two pages */ 413 if (buffer->pages == 1) 414 buffer->pages++; 415 416 buffer->cpumask = cpu_possible_map; 417 buffer->cpus = nr_cpu_ids; 418 419 bsize = sizeof(void *) * nr_cpu_ids; 420 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), 421 GFP_KERNEL); 422 if (!buffer->buffers) 423 goto fail_free_buffer; 424 425 for_each_buffer_cpu(buffer, cpu) { 426 buffer->buffers[cpu] = 427 rb_allocate_cpu_buffer(buffer, cpu); 428 if (!buffer->buffers[cpu]) 429 goto fail_free_buffers; 430 } 431 432 mutex_init(&buffer->mutex); 433 434 return buffer; 435 436 fail_free_buffers: 437 for_each_buffer_cpu(buffer, cpu) { 438 if (buffer->buffers[cpu]) 439 rb_free_cpu_buffer(buffer->buffers[cpu]); 440 } 441 kfree(buffer->buffers); 442 443 fail_free_buffer: 444 kfree(buffer); 445 return NULL; 446 } 447 448 /** 449 * ring_buffer_free - free a ring buffer. 450 * @buffer: the buffer to free. 
451 */ 452 void 453 ring_buffer_free(struct ring_buffer *buffer) 454 { 455 int cpu; 456 457 for_each_buffer_cpu(buffer, cpu) 458 rb_free_cpu_buffer(buffer->buffers[cpu]); 459 460 kfree(buffer); 461 } 462 463 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 464 465 static void 466 rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) 467 { 468 struct buffer_page *page; 469 struct list_head *p; 470 unsigned i; 471 472 atomic_inc(&cpu_buffer->record_disabled); 473 synchronize_sched(); 474 475 for (i = 0; i < nr_pages; i++) { 476 BUG_ON(list_empty(&cpu_buffer->pages)); 477 p = cpu_buffer->pages.next; 478 page = list_entry(p, struct buffer_page, list); 479 list_del_init(&page->list); 480 free_buffer_page(page); 481 } 482 BUG_ON(list_empty(&cpu_buffer->pages)); 483 484 rb_reset_cpu(cpu_buffer); 485 486 rb_check_pages(cpu_buffer); 487 488 atomic_dec(&cpu_buffer->record_disabled); 489 490 } 491 492 static void 493 rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 494 struct list_head *pages, unsigned nr_pages) 495 { 496 struct buffer_page *page; 497 struct list_head *p; 498 unsigned i; 499 500 atomic_inc(&cpu_buffer->record_disabled); 501 synchronize_sched(); 502 503 for (i = 0; i < nr_pages; i++) { 504 BUG_ON(list_empty(pages)); 505 p = pages->next; 506 page = list_entry(p, struct buffer_page, list); 507 list_del_init(&page->list); 508 list_add_tail(&page->list, &cpu_buffer->pages); 509 } 510 rb_reset_cpu(cpu_buffer); 511 512 rb_check_pages(cpu_buffer); 513 514 atomic_dec(&cpu_buffer->record_disabled); 515 } 516 517 /** 518 * ring_buffer_resize - resize the ring buffer 519 * @buffer: the buffer to resize. 520 * @size: the new size. 521 * 522 * The tracer is responsible for making sure that the buffer is 523 * not being used while changing the size. 524 * Note: We may be able to change the above requirement by using 525 * RCU synchronizations. 526 * 527 * Minimum size is 2 * BUF_PAGE_SIZE. 528 * 529 * Returns -1 on failure. 
*/
int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	unsigned nr_pages, rm_pages, new_pages;
	struct buffer_page *page, *tmp;
	unsigned long buffer_size;
	unsigned long addr;
	LIST_HEAD(pages);
	int i, cpu;

	/*
	 * Always succeed at resizing a non-existent buffer:
	 */
	if (!buffer)
		return size;

	/* round the request up to a whole number of pages */
	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
	size *= BUF_PAGE_SIZE;
	buffer_size = buffer->pages * BUF_PAGE_SIZE;

	/* we need a minimum of two pages */
	if (size < BUF_PAGE_SIZE * 2)
		size = BUF_PAGE_SIZE * 2;

	/* nothing to do, and the mutex has not been taken yet */
	if (size == buffer_size)
		return size;

	mutex_lock(&buffer->mutex);

	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

	if (size < buffer_size) {

		/* easy case, just free pages */
		BUG_ON(nr_pages >= buffer->pages);

		rm_pages = buffer->pages - nr_pages;

		for_each_buffer_cpu(buffer, cpu) {
			cpu_buffer = buffer->buffers[cpu];
			rb_remove_pages(cpu_buffer, rm_pages);
		}
		goto out;
	}

	/*
	 * This is a bit more difficult. We only want to add pages
	 * when we can allocate enough for all CPUs. We do this
	 * by allocating all the pages and storing them on a local
	 * link list. If we succeed in our allocation, then we
	 * add these pages to the cpu_buffers. Otherwise we just free
	 * them all and return -ENOMEM;
	 */
	BUG_ON(nr_pages <= buffer->pages);
	new_pages = nr_pages - buffer->pages;

	for_each_buffer_cpu(buffer, cpu) {
		for (i = 0; i < new_pages; i++) {
			page = kzalloc_node(ALIGN(sizeof(*page),
						  cache_line_size()),
					    GFP_KERNEL, cpu_to_node(cpu));
			if (!page)
				goto free_pages;
			/* queued first so the error path frees it too */
			list_add(&page->list, &pages);
			addr = __get_free_page(GFP_KERNEL);
			if (!addr)
				goto free_pages;
			page->page = (void *)addr;
		}
	}

	/* each cpu buffer takes new_pages entries off the local list */
	for_each_buffer_cpu(buffer, cpu) {
		cpu_buffer = buffer->buffers[cpu];
		rb_insert_pages(cpu_buffer, &pages, new_pages);
	}

	BUG_ON(!list_empty(&pages));

 out:
	buffer->pages = nr_pages;
	mutex_unlock(&buffer->mutex);

	return size;

 free_pages:
	list_for_each_entry_safe(page, tmp, &pages, list) {
		list_del_init(&page->list);
		free_buffer_page(page);
	}
	mutex_unlock(&buffer->mutex);
	/* NOTE: the failure value is -ENOMEM, not -1 */
	return -ENOMEM;
}

/* True if @event is a padding event */
static inline int rb_null_event(struct ring_buffer_event *event)
{
	return event->type == RINGBUF_TYPE_PADDING;
}

/* Address @index bytes into the data page of @page */
static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
{
	return page->page + index;
}

/* Event at the current read position of the reader page */
static inline struct ring_buffer_event *
rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
{
	return __rb_page_index(cpu_buffer->reader_page,
			       cpu_buffer->reader_page->read);
}

/* Event at the current read position of the head page */
static inline struct ring_buffer_event *
rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
{
	return __rb_page_index(cpu_buffer->head_page,
			       cpu_buffer->head_page->read);
}

/* Event the iterator currently points at */
static inline struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
	return __rb_page_index(iter->head_page, iter->head);
}

/* Current write index of @bpage */
static inline unsigned rb_page_write(struct buffer_page *bpage)
{
	return local_read(&bpage->write);
}

/* Current commit index of @bpage */
static inline unsigned rb_page_commit(struct buffer_page *bpage)
{
	return local_read(&bpage->commit);
}

/* Size is determined by what has been committed */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
	return rb_page_commit(bpage);
}

/* Commit index of the current commit page */
static inline unsigned
rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
{
	return rb_page_commit(cpu_buffer->commit_page);
}

/* Number of committed bytes on the head page */
static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
{
	return rb_page_commit(cpu_buffer->head_page);
}

/*
 * When the tail hits the head and the buffer is in overwrite mode,
 * the head jumps to the next page and all content on the previous
 * page is discarded. But before doing so, we update the overrun
 * variable of the buffer.
 */
static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
{
	struct ring_buffer_event *event;
	unsigned long head;

	/*
	 * Walk the committed part of the head page event by event. The
	 * step expression uses the event fetched in the loop body, which
	 * is also evaluated after the "continue" below.
	 */
	for (head = 0; head < rb_head_size(cpu_buffer);
	     head += rb_event_length(event)) {

		event = __rb_page_index(cpu_buffer->head_page, head);
		BUG_ON(rb_null_event(event));
		/* Only count data entries */
		if (event->type != RINGBUF_TYPE_DATA)
			continue;
		cpu_buffer->overrun++;
		cpu_buffer->entries--;
	}
}

/*
 * Advance *@page to the next page of @cpu_buffer, wrapping around. The
 * list head cpu_buffer->pages is not a page itself and is skipped.
 */
static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
			       struct buffer_page **page)
{
	struct list_head *p = (*page)->list.next;

	if (p == &cpu_buffer->pages)
		p = p->next;

	*page = list_entry(p, struct buffer_page, list);
}

/*
 * Index of @event within its data page, derived from the low bits of
 * its address.
 */
static inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
	unsigned long addr = (unsigned long)event;

	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
}

/*
 * True if @event sits exactly at the commit position: it lives on the
 * commit page and its index equals the commit index of that page.
 */
static inline int
rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
	     struct ring_buffer_event *event)
{
	unsigned long addr = (unsigned long)event;
	unsigned long index;

	index = rb_event_index(event);
	addr &= PAGE_MASK;

	return cpu_buffer->commit_page->page == (void *)addr &&
		rb_commit_index(cpu_buffer) == index;
}

/*
 * Move the commit position to @event. Every page passed on the way is
 * marked fully committed (commit = write) and write_stamp follows the
 * time stamp of the page that becomes the commit page.
 */
static inline void
rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
		    struct ring_buffer_event *event)
{
	unsigned long addr = (unsigned long)event;
	unsigned long index;

	index = rb_event_index(event);
	addr &= PAGE_MASK;

	while (cpu_buffer->commit_page->page != (void *)addr) {
		/* the event's page should come up before the tail page is passed */
		RB_WARN_ON(cpu_buffer,
			   cpu_buffer->commit_page == cpu_buffer->tail_page);
		cpu_buffer->commit_page->commit =
			cpu_buffer->commit_page->write;
		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
	}

	/* Now set the commit to the event's index */
	local_set(&cpu_buffer->commit_page->commit, index);
}

/* Bring the commit position all the way up to the current write position */
static inline void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
	/*
	 * We only race with interrupts and NMIs on this CPU.
	 * If we own the commit event, then we can commit
	 * all others that interrupted us, since the interruptions
	 * are in stack format (they finish before they come
	 * back to us). This allows us to do a simple loop to
	 * assign the commit to the tail.
	 */
	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
		cpu_buffer->commit_page->commit =
			cpu_buffer->commit_page->write;
		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
		cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
		/* add barrier to keep gcc from optimizing too much */
		barrier();
	}
	/* on the final page, repeat until commit matches write */
	while (rb_commit_index(cpu_buffer) !=
	       rb_page_write(cpu_buffer->commit_page)) {
		cpu_buffer->commit_page->commit =
			cpu_buffer->commit_page->write;
		barrier();
	}
}

/* Start reading the reader page from its beginning */
static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
	cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
	cpu_buffer->reader_page->read = 0;
}

/* Move the iterator to the start of the next page to read */
static inline void rb_inc_iter(struct ring_buffer_iter *iter)
{
	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;

	/*
	 * The iterator could be on the reader page (it starts there).
	 * But the head could have moved, since the reader was
	 * found. Check for this case and assign the iterator
	 * to the head page instead of next.
	 */
	if (iter->head_page == cpu_buffer->reader_page)
		iter->head_page = cpu_buffer->head_page;
	else
		rb_inc_page(cpu_buffer, &iter->head_page);

	iter->read_stamp = iter->head_page->time_stamp;
	iter->head = 0;
}

/**
 * rb_update_event - update event type and data
 * @event: the event to update
 * @type: the type of event
 * @length: the size of the event field in the ring buffer
 *
 * Update the type and data fields of the event. The length
 * is the actual size that is written to the ring buffer,
 * and with this, we can determine what to place into the
 * data field.
*/
static inline void
rb_update_event(struct ring_buffer_event *event,
			 unsigned type, unsigned length)
{
	event->type = type;

	switch (type) {

	case RINGBUF_TYPE_PADDING:
		break;

	case RINGBUF_TYPE_TIME_EXTEND:
		/* len is stored in RB_ALIGNMENT units, rounded up */
		event->len =
			(RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
			>> RB_ALIGNMENT_SHIFT;
		break;

	case RINGBUF_TYPE_TIME_STAMP:
		event->len =
			(RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
			>> RB_ALIGNMENT_SHIFT;
		break;

	case RINGBUF_TYPE_DATA:
		/* @length includes the header, the stored length does not */
		length -= RB_EVNT_HDR_SIZE;
		if (length > RB_MAX_SMALL_DATA) {
			/* too big for the len field: len = 0, size in array[0] */
			event->len = 0;
			event->array[0] = length;
		} else
			event->len =
				(length + (RB_ALIGNMENT-1))
				>> RB_ALIGNMENT_SHIFT;
		break;
	default:
		BUG();
	}
}

/*
 * Turn a payload length into the number of bytes the event occupies
 * in the buffer: payload, plus the array[0] length word for payloads
 * larger than RB_MAX_SMALL_DATA, plus the header, rounded up to
 * RB_ALIGNMENT.
 */
static inline unsigned rb_calculate_event_length(unsigned length)
{
	struct ring_buffer_event event; /* Used only for sizeof array */

	/* zero length can cause confusions */
	if (!length)
		length = 1;

	if (length > RB_MAX_SMALL_DATA)
		length += sizeof(event.array[0]);

	length += RB_EVNT_HDR_SIZE;
	length = ALIGN(length, RB_ALIGNMENT);

	return length;
}

/*
 * Reserve @length bytes (full event length) on the tail page for an
 * event of @type.
 *
 * Returns the reserved event, ERR_PTR(-EAGAIN) when the reservation ran
 * off the end of the page and the caller should retry, or NULL when
 * nothing can be reserved: the buffer is full and not in overwrite mode,
 * or the next page is the commit page.
 *
 * *@ts is rewritten with a fresh time stamp when this call moves the
 * tail page forward.
 */
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
		  unsigned type, unsigned long length, u64 *ts)
{
	struct buffer_page *tail_page, *head_page, *reader_page;
	unsigned long tail, write;
	struct ring_buffer *buffer = cpu_buffer->buffer;
	struct ring_buffer_event *event;
	unsigned long flags;

	/*
	 * Claim the space first: "write" is the index after our event,
	 * "tail" the index where our event starts.
	 */
	tail_page = cpu_buffer->tail_page;
	write = local_add_return(length, &tail_page->write);
	tail = write - length;

	/* See if we shot past the end of this buffer page */
	if (write > BUF_PAGE_SIZE) {
		struct buffer_page *next_page = tail_page;

		spin_lock_irqsave(&cpu_buffer->lock, flags);

		rb_inc_page(cpu_buffer, &next_page);

		head_page = cpu_buffer->head_page;
		reader_page = cpu_buffer->reader_page;

		/* we grabbed the lock before incrementing */
		RB_WARN_ON(cpu_buffer, next_page == reader_page);

		/*
		 * If for some reason, we had an interrupt storm that made
		 * it all the way around the buffer, bail, and warn
		 * about it.
		 */
		if (unlikely(next_page == cpu_buffer->commit_page)) {
			WARN_ON_ONCE(1);
			goto out_unlock;
		}

		if (next_page == head_page) {
			/* the writer caught up with the reader side */
			if (!(buffer->flags & RB_FL_OVERWRITE)) {
				/* reset write */
				if (tail <= BUF_PAGE_SIZE)
					local_set(&tail_page->write, tail);
				goto out_unlock;
			}

			/* tail_page has not moved yet? */
			if (tail_page == cpu_buffer->tail_page) {
				/* count overflows */
				rb_update_overflow(cpu_buffer);

				rb_inc_page(cpu_buffer, &head_page);
				cpu_buffer->head_page = head_page;
				cpu_buffer->head_page->read = 0;
			}
		}

		/*
		 * If the tail page is still the same as what we think
		 * it is, then it is up to us to update the tail
		 * pointer.
		 */
		if (tail_page == cpu_buffer->tail_page) {
			local_set(&next_page->write, 0);
			local_set(&next_page->commit, 0);
			cpu_buffer->tail_page = next_page;

			/* reread the time stamp */
			*ts = ring_buffer_time_stamp(cpu_buffer->cpu);
			cpu_buffer->tail_page->time_stamp = *ts;
		}

		/*
		 * The actual tail page has moved forward.
		 */
		if (tail < BUF_PAGE_SIZE) {
			/* Mark the rest of the page with padding */
			event = __rb_page_index(tail_page, tail);
			event->type = RINGBUF_TYPE_PADDING;
		}

		if (tail <= BUF_PAGE_SIZE)
			/* Set the write back to the previous setting */
			local_set(&tail_page->write, tail);

		/*
		 * If this was a commit entry that failed,
		 * increment that too
		 */
		if (tail_page == cpu_buffer->commit_page &&
		    tail == rb_commit_index(cpu_buffer)) {
			rb_set_commit_to_write(cpu_buffer);
		}

		spin_unlock_irqrestore(&cpu_buffer->lock, flags);

		/* fail and let the caller try again */
		return ERR_PTR(-EAGAIN);
	}

	/* We reserved something on the buffer */

	BUG_ON(write > BUF_PAGE_SIZE);

	event = __rb_page_index(tail_page, tail);
	rb_update_event(event, type, length);

	/*
	 * If this is a commit and the tail is zero, then update
	 * this page's time stamp.
	 */
	if (!tail && rb_is_commit(cpu_buffer, event))
		cpu_buffer->commit_page->time_stamp = *ts;

	return event;

 out_unlock:
	spin_unlock_irqrestore(&cpu_buffer->lock, flags);
	return NULL;
}

/*
 * Reserve a TIME_EXTEND event because *@delta does not fit into the
 * time_delta field of an event.
 *
 * Returns 1 if the time event is the commit (write_stamp was updated),
 * 0 if it is not (the event then only carries zeros), -EBUSY if nothing
 * could be reserved, and -EAGAIN if the caller should retry.
 * *@delta is set to 0 whenever a time event was reserved.
 */
static int
rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
		  u64 *ts, u64 *delta)
{
	struct ring_buffer_event *event;
	static int once;
	int ret;

	/* complain about an absurdly large delta, but only the first time */
	if (unlikely(*delta > (1ULL << 59) && !once++)) {
		printk(KERN_WARNING "Delta way too big! %llu"
		       " ts=%llu write stamp = %llu\n",
		       (unsigned long long)*delta,
		       (unsigned long long)*ts,
		       (unsigned long long)cpu_buffer->write_stamp);
		WARN_ON(1);
	}

	/*
	 * The delta is too big, we need to add a
	 * new timestamp.
	 */
	event = __rb_reserve_next(cpu_buffer,
				  RINGBUF_TYPE_TIME_EXTEND,
				  RB_LEN_TIME_EXTEND,
				  ts);
	if (!event)
		return -EBUSY;

	if (PTR_ERR(event) == -EAGAIN)
		return -EAGAIN;

	/* Only a committed time event can update the write stamp */
	if (rb_is_commit(cpu_buffer, event)) {
		/*
		 * If this is the first on the page, then we need to
		 * update the page itself, and just put in a zero.
		 */
		if (rb_event_index(event)) {
			/* low TS_SHIFT bits in time_delta, the rest in array[0] */
			event->time_delta = *delta & TS_MASK;
			event->array[0] = *delta >> TS_SHIFT;
		} else {
			cpu_buffer->commit_page->time_stamp = *ts;
			event->time_delta = 0;
			event->array[0] = 0;
		}
		cpu_buffer->write_stamp = *ts;
		/* let the caller know this was the commit */
		ret = 1;
	} else {
		/* Darn, this is just wasted space */
		event->time_delta = 0;
		event->array[0] = 0;
		ret = 0;
	}

	*delta = 0;

	return ret;
}

/*
 * Reserve an event of @type and @length (full event length) on
 * @cpu_buffer, adding a TIME_EXTEND event in front of it when the time
 * delta since the last write stamp does not fit into an event.
 *
 * Returns the reserved event with its time_delta filled in, or NULL if
 * nothing could be reserved.
 */
static struct ring_buffer_event *
rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
		      unsigned type, unsigned long length)
{
	struct ring_buffer_event *event;
	u64 ts, delta;
	int commit = 0;
	int nr_loops = 0;

 again:
	/*
	 * We allow for interrupts to reenter here and do a trace.
	 * If one does, it will cause this original code to loop
	 * back here. Even with heavy interrupts happening, this
	 * should only happen a few times in a row. If this happens
	 * 1000 times in a row, there must be either an interrupt
	 * storm or we have something buggy.
	 * Bail!
	 */
	if (unlikely(++nr_loops > 1000)) {
		RB_WARN_ON(cpu_buffer, 1);
		return NULL;
	}

	ts = ring_buffer_time_stamp(cpu_buffer->cpu);

	/*
	 * Only the first commit can update the timestamp.
	 * Yes there is a race here. If an interrupt comes in
	 * just after the conditional and it traces too, then it
	 * will also check the deltas. More than one timestamp may
	 * also be made. But only the entry that did the actual
	 * commit will be something other than zero.
	 */
	if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
	    rb_page_write(cpu_buffer->tail_page) ==
	    rb_commit_index(cpu_buffer)) {

		delta = ts - cpu_buffer->write_stamp;

		/* make sure this delta is calculated here */
		barrier();

		/* Did the write stamp get updated already? */
		if (unlikely(ts < cpu_buffer->write_stamp))
			delta = 0;

		if (test_time_stamp(delta)) {

			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);

			if (commit == -EBUSY)
				return NULL;

			if (commit == -EAGAIN)
				goto again;

			RB_WARN_ON(cpu_buffer, commit < 0);
		}
	} else
		/* Non commits have zero deltas */
		delta = 0;

	event = __rb_reserve_next(cpu_buffer, type, length, &ts);
	if (PTR_ERR(event) == -EAGAIN)
		goto again;

	if (!event) {
		if (unlikely(commit))
			/*
			 * Ouch! We needed a timestamp and it was committed. But
			 * we didn't get our event reserved.
			 */
			rb_set_commit_to_write(cpu_buffer);
		return NULL;
	}

	/*
	 * If the timestamp was committed, make the commit our entry
	 * now so that we will update it when needed.
	 */
	if (commit)
		rb_set_commit_event(cpu_buffer, event);
	else if (!rb_is_commit(cpu_buffer, event))
		delta = 0;

	event->time_delta = delta;

	return event;
}

/*
 * Per cpu copy of the need_resched() state sampled by
 * ring_buffer_lock_reserve(); read back by ring_buffer_unlock_commit().
 */
static DEFINE_PER_CPU(int, rb_need_resched);

/**
 * ring_buffer_lock_reserve - reserve a part of the buffer
 * @buffer: the ring buffer to reserve from
 * @length: the length of the data to reserve (excluding event header)
 * @flags: a pointer to save the interrupt flags
 *
 * Returns a reserved event on the ring buffer to copy directly to.
 * The user of this interface will need to get the body to write into
 * and can use the ring_buffer_event_data() interface.
 *
 * The length is the length of the data needed, not the event length
 * which also includes the event header.
 *
 * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
 * If NULL is returned, then nothing has been allocated or locked.
1168 */ 1169 struct ring_buffer_event * 1170 ring_buffer_lock_reserve(struct ring_buffer *buffer, 1171 unsigned long length, 1172 unsigned long *flags) 1173 { 1174 struct ring_buffer_per_cpu *cpu_buffer; 1175 struct ring_buffer_event *event; 1176 int cpu, resched; 1177 1178 if (ring_buffers_off) 1179 return NULL; 1180 1181 if (atomic_read(&buffer->record_disabled)) 1182 return NULL; 1183 1184 /* If we are tracing schedule, we don't want to recurse */ 1185 resched = need_resched(); 1186 preempt_disable_notrace(); 1187 1188 cpu = raw_smp_processor_id(); 1189 1190 if (!cpu_isset(cpu, buffer->cpumask)) 1191 goto out; 1192 1193 cpu_buffer = buffer->buffers[cpu]; 1194 1195 if (atomic_read(&cpu_buffer->record_disabled)) 1196 goto out; 1197 1198 length = rb_calculate_event_length(length); 1199 if (length > BUF_PAGE_SIZE) 1200 goto out; 1201 1202 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); 1203 if (!event) 1204 goto out; 1205 1206 /* 1207 * Need to store resched state on this cpu. 1208 * Only the first needs to. 1209 */ 1210 1211 if (preempt_count() == 1) 1212 per_cpu(rb_need_resched, cpu) = resched; 1213 1214 return event; 1215 1216 out: 1217 if (resched) 1218 preempt_enable_no_resched_notrace(); 1219 else 1220 preempt_enable_notrace(); 1221 return NULL; 1222 } 1223 1224 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1225 struct ring_buffer_event *event) 1226 { 1227 cpu_buffer->entries++; 1228 1229 /* Only process further if we own the commit */ 1230 if (!rb_is_commit(cpu_buffer, event)) 1231 return; 1232 1233 cpu_buffer->write_stamp += event->time_delta; 1234 1235 rb_set_commit_to_write(cpu_buffer); 1236 } 1237 1238 /** 1239 * ring_buffer_unlock_commit - commit a reserved 1240 * @buffer: The buffer to commit to 1241 * @event: The event pointer to commit. 1242 * @flags: the interrupt flags received from ring_buffer_lock_reserve. 1243 * 1244 * This commits the data to the ring buffer, and releases any locks held. 
 *
 * Must be paired with ring_buffer_lock_reserve.
 */
int ring_buffer_unlock_commit(struct ring_buffer *buffer,
			      struct ring_buffer_event *event,
			      unsigned long flags)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	int cpu = raw_smp_processor_id();

	cpu_buffer = buffer->buffers[cpu];

	rb_commit(cpu_buffer, event);

	/*
	 * Only the last preempt count needs to restore preemption.
	 *
	 * rb_need_resched holds the need_resched() state that
	 * ring_buffer_lock_reserve() saved for the outermost reserve on
	 * this cpu; if a reschedule was already pending back then, do not
	 * trigger it from here. Nested commits never reschedule.
	 */
	if (preempt_count() == 1) {
		if (per_cpu(rb_need_resched, cpu))
			preempt_enable_no_resched_notrace();
		else
			preempt_enable_notrace();
	} else
		preempt_enable_no_resched_notrace();

	/* This function currently has no failure path; @flags is unused */
	return 0;
}

/**
 * ring_buffer_write - write data to the buffer without reserving
 * @buffer: The ring buffer to write to.
 * @length: The length of the data being written (excluding the event header)
 * @data: The data to write to the buffer.
 *
 * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
 * one function. If you already have the data to write to the buffer, it
 * may be easier to simply call this function.
 *
 * Note, like ring_buffer_lock_reserve, the length is the length of the data
 * and not the length of the event which would hold the header.
1285 */ 1286 int ring_buffer_write(struct ring_buffer *buffer, 1287 unsigned long length, 1288 void *data) 1289 { 1290 struct ring_buffer_per_cpu *cpu_buffer; 1291 struct ring_buffer_event *event; 1292 unsigned long event_length; 1293 void *body; 1294 int ret = -EBUSY; 1295 int cpu, resched; 1296 1297 if (ring_buffers_off) 1298 return -EBUSY; 1299 1300 if (atomic_read(&buffer->record_disabled)) 1301 return -EBUSY; 1302 1303 resched = need_resched(); 1304 preempt_disable_notrace(); 1305 1306 cpu = raw_smp_processor_id(); 1307 1308 if (!cpu_isset(cpu, buffer->cpumask)) 1309 goto out; 1310 1311 cpu_buffer = buffer->buffers[cpu]; 1312 1313 if (atomic_read(&cpu_buffer->record_disabled)) 1314 goto out; 1315 1316 event_length = rb_calculate_event_length(length); 1317 event = rb_reserve_next_event(cpu_buffer, 1318 RINGBUF_TYPE_DATA, event_length); 1319 if (!event) 1320 goto out; 1321 1322 body = rb_event_data(event); 1323 1324 memcpy(body, data, length); 1325 1326 rb_commit(cpu_buffer, event); 1327 1328 ret = 0; 1329 out: 1330 if (resched) 1331 preempt_enable_no_resched_notrace(); 1332 else 1333 preempt_enable_notrace(); 1334 1335 return ret; 1336 } 1337 1338 static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1339 { 1340 struct buffer_page *reader = cpu_buffer->reader_page; 1341 struct buffer_page *head = cpu_buffer->head_page; 1342 struct buffer_page *commit = cpu_buffer->commit_page; 1343 1344 return reader->read == rb_page_commit(reader) && 1345 (commit == reader || 1346 (commit == head && 1347 head->read == rb_page_commit(commit))); 1348 } 1349 1350 /** 1351 * ring_buffer_record_disable - stop all writes into the buffer 1352 * @buffer: The ring buffer to stop writes to. 1353 * 1354 * This prevents all writes to the buffer. Any attempt to write 1355 * to the buffer after this will fail and return NULL. 1356 * 1357 * The caller should call synchronize_sched() after this. 
1358 */ 1359 void ring_buffer_record_disable(struct ring_buffer *buffer) 1360 { 1361 atomic_inc(&buffer->record_disabled); 1362 } 1363 1364 /** 1365 * ring_buffer_record_enable - enable writes to the buffer 1366 * @buffer: The ring buffer to enable writes 1367 * 1368 * Note, multiple disables will need the same number of enables 1369 * to truely enable the writing (much like preempt_disable). 1370 */ 1371 void ring_buffer_record_enable(struct ring_buffer *buffer) 1372 { 1373 atomic_dec(&buffer->record_disabled); 1374 } 1375 1376 /** 1377 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 1378 * @buffer: The ring buffer to stop writes to. 1379 * @cpu: The CPU buffer to stop 1380 * 1381 * This prevents all writes to the buffer. Any attempt to write 1382 * to the buffer after this will fail and return NULL. 1383 * 1384 * The caller should call synchronize_sched() after this. 1385 */ 1386 void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) 1387 { 1388 struct ring_buffer_per_cpu *cpu_buffer; 1389 1390 if (!cpu_isset(cpu, buffer->cpumask)) 1391 return; 1392 1393 cpu_buffer = buffer->buffers[cpu]; 1394 atomic_inc(&cpu_buffer->record_disabled); 1395 } 1396 1397 /** 1398 * ring_buffer_record_enable_cpu - enable writes to the buffer 1399 * @buffer: The ring buffer to enable writes 1400 * @cpu: The CPU to enable. 1401 * 1402 * Note, multiple disables will need the same number of enables 1403 * to truely enable the writing (much like preempt_disable). 1404 */ 1405 void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 1406 { 1407 struct ring_buffer_per_cpu *cpu_buffer; 1408 1409 if (!cpu_isset(cpu, buffer->cpumask)) 1410 return; 1411 1412 cpu_buffer = buffer->buffers[cpu]; 1413 atomic_dec(&cpu_buffer->record_disabled); 1414 } 1415 1416 /** 1417 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 1418 * @buffer: The ring buffer 1419 * @cpu: The per CPU buffer to get the entries from. 
1420 */ 1421 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 1422 { 1423 struct ring_buffer_per_cpu *cpu_buffer; 1424 1425 if (!cpu_isset(cpu, buffer->cpumask)) 1426 return 0; 1427 1428 cpu_buffer = buffer->buffers[cpu]; 1429 return cpu_buffer->entries; 1430 } 1431 1432 /** 1433 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 1434 * @buffer: The ring buffer 1435 * @cpu: The per CPU buffer to get the number of overruns from 1436 */ 1437 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) 1438 { 1439 struct ring_buffer_per_cpu *cpu_buffer; 1440 1441 if (!cpu_isset(cpu, buffer->cpumask)) 1442 return 0; 1443 1444 cpu_buffer = buffer->buffers[cpu]; 1445 return cpu_buffer->overrun; 1446 } 1447 1448 /** 1449 * ring_buffer_entries - get the number of entries in a buffer 1450 * @buffer: The ring buffer 1451 * 1452 * Returns the total number of entries in the ring buffer 1453 * (all CPU entries) 1454 */ 1455 unsigned long ring_buffer_entries(struct ring_buffer *buffer) 1456 { 1457 struct ring_buffer_per_cpu *cpu_buffer; 1458 unsigned long entries = 0; 1459 int cpu; 1460 1461 /* if you care about this being correct, lock the buffer */ 1462 for_each_buffer_cpu(buffer, cpu) { 1463 cpu_buffer = buffer->buffers[cpu]; 1464 entries += cpu_buffer->entries; 1465 } 1466 1467 return entries; 1468 } 1469 1470 /** 1471 * ring_buffer_overrun_cpu - get the number of overruns in buffer 1472 * @buffer: The ring buffer 1473 * 1474 * Returns the total number of overruns in the ring buffer 1475 * (all CPU entries) 1476 */ 1477 unsigned long ring_buffer_overruns(struct ring_buffer *buffer) 1478 { 1479 struct ring_buffer_per_cpu *cpu_buffer; 1480 unsigned long overruns = 0; 1481 int cpu; 1482 1483 /* if you care about this being correct, lock the buffer */ 1484 for_each_buffer_cpu(buffer, cpu) { 1485 cpu_buffer = buffer->buffers[cpu]; 1486 overruns += cpu_buffer->overrun; 1487 } 1488 1489 return overruns; 1490 } 1491 1492 
/**
 * ring_buffer_iter_reset - reset an iterator
 * @iter: The iterator to reset
 *
 * Resets the iterator, so that it will start from the beginning
 * again.
 */
void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
{
	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;

	/* Iterator usage is expected to have record disabled */
	/*
	 * An empty list on the reader page is how rb_reset_cpu() leaves it
	 * (INIT_LIST_HEAD); in that case start from the head page,
	 * otherwise start from the reader page.
	 */
	if (list_empty(&cpu_buffer->reader_page->list)) {
		iter->head_page = cpu_buffer->head_page;
		iter->head = cpu_buffer->head_page->read;
	} else {
		iter->head_page = cpu_buffer->reader_page;
		iter->head = cpu_buffer->reader_page->read;
	}
	/*
	 * Part way into a page: continue from the cpu buffer's running
	 * read stamp. At the very start of a page: use the page's own
	 * time stamp.
	 */
	if (iter->head)
		iter->read_stamp = cpu_buffer->read_stamp;
	else
		iter->read_stamp = iter->head_page->time_stamp;
}

/**
 * ring_buffer_iter_empty - check if an iterator has no more to read
 * @iter: The iterator to check
 *
 * Returns non-zero when the iterator sits on the commit page at the
 * commit index.
 */
int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
{
	struct ring_buffer_per_cpu *cpu_buffer;

	cpu_buffer = iter->cpu_buffer;

	return iter->head_page == cpu_buffer->commit_page &&
		iter->head == rb_commit_index(cpu_buffer);
}

/*
 * rb_update_read_stamp - advance the consuming reader's time stamp
 *
 * Adds the time carried by @event to cpu_buffer->read_stamp. Padding
 * and (unimplemented) time stamp events leave it untouched.
 */
static void
rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
		     struct ring_buffer_event *event)
{
	u64 delta;

	switch (event->type) {
	case RINGBUF_TYPE_PADDING:
		return;

	case RINGBUF_TYPE_TIME_EXTEND:
		/* array[0] holds the upper bits, time_delta the lower ones */
		delta = event->array[0];
		delta <<= TS_SHIFT;
		delta += event->time_delta;
		cpu_buffer->read_stamp += delta;
		return;

	case RINGBUF_TYPE_TIME_STAMP:
		/* FIXME: not implemented */
		return;

	case RINGBUF_TYPE_DATA:
		cpu_buffer->read_stamp += event->time_delta;
		return;

	default:
		BUG();
	}
	return;
}

/*
 * rb_update_iter_read_stamp - advance an iterator's time stamp
 *
 * Same as rb_update_read_stamp(), but accumulates into iter->read_stamp
 * instead of the cpu buffer's read_stamp.
 */
static void
rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
			  struct ring_buffer_event *event)
{
	u64 delta;

	switch (event->type) {
	case RINGBUF_TYPE_PADDING:
		return;

	case RINGBUF_TYPE_TIME_EXTEND:
		/* array[0] holds the upper bits, time_delta the lower ones */
		delta = event->array[0];
		delta <<= TS_SHIFT;
		delta += event->time_delta;
		iter->read_stamp += delta;
		return;

	case RINGBUF_TYPE_TIME_STAMP:
		/* FIXME: not implemented */
		return;

	case RINGBUF_TYPE_DATA:
		iter->read_stamp += event->time_delta;
		return;

	default:
		BUG();
	}
	return;
}

/*
 * rb_get_reader_page - get the page the consuming reader should read
 *
 * Returns the current reader page while it still has unread data.
 * Once it is used up, the reader page is swapped into the page list
 * in place of the head page, and the old head page becomes the new
 * reader page. Returns NULL when the reader has caught up with the
 * commit page, or when the loop limit below is exceeded.
 *
 * Runs under cpu_buffer->lock with interrupts disabled.
 */
static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
	struct buffer_page *reader = NULL;
	unsigned long flags;
	int nr_loops = 0;

	spin_lock_irqsave(&cpu_buffer->lock, flags);

 again:
	/*
	 * This should normally only loop twice. But because the
	 * start of the reader inserts an empty page, it causes
	 * a case where we will loop three times. There should be no
	 * reason to loop four times (that I know of).
	 */
	if (unlikely(++nr_loops > 3)) {
		RB_WARN_ON(cpu_buffer, 1);
		reader = NULL;
		goto out;
	}

	reader = cpu_buffer->reader_page;

	/* If there's more to read, return this page */
	if (cpu_buffer->reader_page->read < rb_page_size(reader))
		goto out;

	/* Never should we have an index greater than the size */
	RB_WARN_ON(cpu_buffer,
		   cpu_buffer->reader_page->read > rb_page_size(reader));

	/* check if we caught up to the tail */
	reader = NULL;
	if (cpu_buffer->commit_page == cpu_buffer->reader_page)
		goto out;

	/*
	 * Splice the empty reader page into the list around the head.
	 * Reset the reader page to size zero.
	 */

	reader = cpu_buffer->head_page;
	cpu_buffer->reader_page->list.next = reader->list.next;
	cpu_buffer->reader_page->list.prev = reader->list.prev;

	local_set(&cpu_buffer->reader_page->write, 0);
	local_set(&cpu_buffer->reader_page->commit, 0);

	/* Make the reader page now replace the head */
	reader->list.prev->next = &cpu_buffer->reader_page->list;
	reader->list.next->prev = &cpu_buffer->reader_page->list;

	/*
	 * If the tail is on the reader, then we must set the head
	 * to the inserted page, otherwise we set it one before.
	 */
	cpu_buffer->head_page = cpu_buffer->reader_page;

	if (cpu_buffer->commit_page != reader)
		rb_inc_page(cpu_buffer, &cpu_buffer->head_page);

	/* Finally update the reader page to the new head */
	cpu_buffer->reader_page = reader;
	rb_reset_reader_page(cpu_buffer);

	/* Re-run the checks above against the new reader page */
	goto again;

 out:
	spin_unlock_irqrestore(&cpu_buffer->lock, flags);

	return reader;
}

/*
 * rb_advance_reader - consume the event at the reader position
 *
 * Drops the entry count for data events, folds the event's time into
 * read_stamp and moves the reader page's read index past the event.
 */
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
	struct ring_buffer_event *event;
	struct buffer_page *reader;
	unsigned length;

	reader = rb_get_reader_page(cpu_buffer);

	/* This function should not be called when buffer is empty */
	BUG_ON(!reader);

	event = rb_reader_event(cpu_buffer);

	/* Only data events were counted in entries */
	if (event->type == RINGBUF_TYPE_DATA)
		cpu_buffer->entries--;

	rb_update_read_stamp(cpu_buffer, event);

	length = rb_event_length(event);
	cpu_buffer->reader_page->read += length;
}

/*
 * rb_advance_iter - move an iterator past the event it points at
 *
 * If the iterator is already at the end of its page it only steps to
 * the next page (rb_inc_iter). Otherwise it accumulates the event's
 * time into the iterator's read stamp, skips the event, and recurses
 * once more if that landed on the end of a page that is not the
 * commit page.
 */
static void rb_advance_iter(struct ring_buffer_iter *iter)
{
	struct ring_buffer *buffer;
	struct ring_buffer_per_cpu *cpu_buffer;
	struct ring_buffer_event *event;
	unsigned length;

	cpu_buffer = iter->cpu_buffer;
	/* NOTE(review): 'buffer' is assigned but never used below */
	buffer = cpu_buffer->buffer;

	/*
	 * Check if we are at the end of the buffer.
	 */
	if (iter->head >= rb_page_size(iter->head_page)) {
		BUG_ON(iter->head_page == cpu_buffer->commit_page);
		rb_inc_iter(iter);
		return;
	}

	event = rb_iter_head_event(iter);

	length = rb_event_length(event);

	/*
	 * This should not be called to advance the header if we are
	 * at the tail of the buffer.
	 */
	BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
	       (iter->head + length > rb_commit_index(cpu_buffer)));

	rb_update_iter_read_stamp(iter, event);

	iter->head += length;

	/* check for end of page padding */
	if ((iter->head >= rb_page_size(iter->head_page)) &&
	    (iter->head_page != cpu_buffer->commit_page))
		rb_advance_iter(iter);
}

/**
 * ring_buffer_peek - peek at the next event to be read
 * @buffer: The ring buffer to read
 * @cpu: The cpu to peek at
 * @ts: The timestamp counter of this event.
 *
 * This will return the event that will be read next, but does
 * not consume the data.
 *
 * Time extend and time stamp events in front of the data event are
 * internal and are consumed on the way. Returns NULL if @cpu is not
 * part of @buffer, if there is nothing to read, or if a padding event
 * is found at the reader position. @ts may be NULL.
 */
struct ring_buffer_event *
ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	struct ring_buffer_event *event;
	struct buffer_page *reader;
	int nr_loops = 0;

	if (!cpu_isset(cpu, buffer->cpumask))
		return NULL;

	cpu_buffer = buffer->buffers[cpu];

 again:
	/*
	 * We repeat when a timestamp is encountered. It is possible
	 * to get multiple timestamps from an interrupt entering just
	 * as one timestamp is about to be written. The max times
	 * that this can happen is the number of nested interrupts we
	 * can have. Nesting 10 deep of interrupts is clearly
	 * an anomaly.
	 */
	if (unlikely(++nr_loops > 10)) {
		RB_WARN_ON(cpu_buffer, 1);
		return NULL;
	}

	reader = rb_get_reader_page(cpu_buffer);
	if (!reader)
		return NULL;

	event = rb_reader_event(cpu_buffer);

	switch (event->type) {
	case RINGBUF_TYPE_PADDING:
		/* Padding is not expected at the reader position */
		RB_WARN_ON(cpu_buffer, 1);
		rb_advance_reader(cpu_buffer);
		return NULL;

	case RINGBUF_TYPE_TIME_EXTEND:
		/* Internal data, OK to advance */
		rb_advance_reader(cpu_buffer);
		goto again;

	case RINGBUF_TYPE_TIME_STAMP:
		/* FIXME: not implemented */
		rb_advance_reader(cpu_buffer);
		goto again;

	case RINGBUF_TYPE_DATA:
		if (ts) {
			/* read_stamp is only advanced on consume */
			*ts = cpu_buffer->read_stamp + event->time_delta;
			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
		}
		return event;

	default:
		BUG();
	}

	return NULL;
}

/**
 * ring_buffer_iter_peek - peek at the next event to be read
 * @iter: The ring buffer iterator
 * @ts: The timestamp counter of this event.
 *
 * This will return the event that will be read next, but does
 * not increment the iterator.
 *
 * Padding, time extend and time stamp events in front of the data
 * event are skipped, which does move the iterator past them.
 * Returns NULL when there is nothing left to read. @ts may be NULL.
 */
struct ring_buffer_event *
ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
{
	struct ring_buffer *buffer;
	struct ring_buffer_per_cpu *cpu_buffer;
	struct ring_buffer_event *event;
	int nr_loops = 0;

	if (ring_buffer_iter_empty(iter))
		return NULL;

	cpu_buffer = iter->cpu_buffer;
	/* NOTE(review): 'buffer' is assigned but never used below */
	buffer = cpu_buffer->buffer;

 again:
	/*
	 * We repeat when a timestamp is encountered. It is possible
	 * to get multiple timestamps from an interrupt entering just
	 * as one timestamp is about to be written. The max times
	 * that this can happen is the number of nested interrupts we
	 * can have. Nesting 10 deep of interrupts is clearly
	 * an anomaly.
	 */
	if (unlikely(++nr_loops > 10)) {
		RB_WARN_ON(cpu_buffer, 1);
		return NULL;
	}

	if (rb_per_cpu_empty(cpu_buffer))
		return NULL;

	event = rb_iter_head_event(iter);

	switch (event->type) {
	case RINGBUF_TYPE_PADDING:
		/* Rest of this page is unused, move on to the next page */
		rb_inc_iter(iter);
		goto again;

	case RINGBUF_TYPE_TIME_EXTEND:
		/* Internal data, OK to advance */
		rb_advance_iter(iter);
		goto again;

	case RINGBUF_TYPE_TIME_STAMP:
		/* FIXME: not implemented */
		rb_advance_iter(iter);
		goto again;

	case RINGBUF_TYPE_DATA:
		if (ts) {
			/* iter->read_stamp is only advanced by rb_advance_iter */
			*ts = iter->read_stamp + event->time_delta;
			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
		}
		return event;

	default:
		BUG();
	}

	return NULL;
}

/**
 * ring_buffer_consume - return an event and consume it
 * @buffer: The ring buffer to get the next event from
 * @cpu: The per CPU buffer to consume from
 * @ts: Where to store the time stamp of the event (may be NULL)
 *
 * Returns the next event in the ring buffer, and that event is consumed.
 * Meaning, that sequential reads will keep returning a different event,
 * and eventually empty the ring buffer if the producer is slower.
 *
 * Returns NULL if @cpu is not part of @buffer or nothing can be read.
 */
struct ring_buffer_event *
ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	struct ring_buffer_event *event;

	if (!cpu_isset(cpu, buffer->cpumask))
		return NULL;

	event = ring_buffer_peek(buffer, cpu, ts);
	if (!event)
		return NULL;

	/* Step over the event that was just peeked at */
	cpu_buffer = buffer->buffers[cpu];
	rb_advance_reader(cpu_buffer);

	return event;
}

/**
 * ring_buffer_read_start - start a non consuming read of the buffer
 * @buffer: The ring buffer to read from
 * @cpu: The cpu buffer to iterate over
 *
 * This starts up an iteration through the buffer. It also disables
 * the recording to the buffer until the reading is finished.
 * This prevents the reading from being corrupted. This is not
 * a consuming read, so a producer is not expected.
 *
 * Returns the new iterator, or NULL if @cpu is not part of @buffer or
 * the iterator could not be allocated.
 *
 * Must be paired with ring_buffer_read_finish.
 */
struct ring_buffer_iter *
ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	struct ring_buffer_iter *iter;
	unsigned long flags;

	if (!cpu_isset(cpu, buffer->cpumask))
		return NULL;

	/* GFP_KERNEL allocation and synchronize_sched() below may sleep */
	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return NULL;

	cpu_buffer = buffer->buffers[cpu];

	iter->cpu_buffer = cpu_buffer;

	/* Stop new writes to this cpu buffer, then wait for writers in flight */
	atomic_inc(&cpu_buffer->record_disabled);
	synchronize_sched();

	spin_lock_irqsave(&cpu_buffer->lock, flags);
	ring_buffer_iter_reset(iter);
	spin_unlock_irqrestore(&cpu_buffer->lock, flags);

	return iter;
}

/**
 * ring_buffer_read_finish - finish reading the iterator of the buffer
 * @iter: The iterator retrieved by ring_buffer_read_start
 *
 * This re-enables the recording to the buffer, and frees the
 * iterator.
 */
void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;

	/* Undo the record_disabled increment from ring_buffer_read_start() */
	atomic_dec(&cpu_buffer->record_disabled);
	kfree(iter);
}

/**
 * ring_buffer_read - read the next item in the ring buffer by the iterator
 * @iter: The ring buffer iterator
 * @ts: The time stamp of the event read.
 *
 * This reads the next event in the ring buffer and increments the iterator.
1960 */ 1961 struct ring_buffer_event * 1962 ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) 1963 { 1964 struct ring_buffer_event *event; 1965 1966 event = ring_buffer_iter_peek(iter, ts); 1967 if (!event) 1968 return NULL; 1969 1970 rb_advance_iter(iter); 1971 1972 return event; 1973 } 1974 1975 /** 1976 * ring_buffer_size - return the size of the ring buffer (in bytes) 1977 * @buffer: The ring buffer. 1978 */ 1979 unsigned long ring_buffer_size(struct ring_buffer *buffer) 1980 { 1981 return BUF_PAGE_SIZE * buffer->pages; 1982 } 1983 1984 static void 1985 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 1986 { 1987 cpu_buffer->head_page 1988 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1989 local_set(&cpu_buffer->head_page->write, 0); 1990 local_set(&cpu_buffer->head_page->commit, 0); 1991 1992 cpu_buffer->head_page->read = 0; 1993 1994 cpu_buffer->tail_page = cpu_buffer->head_page; 1995 cpu_buffer->commit_page = cpu_buffer->head_page; 1996 1997 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1998 local_set(&cpu_buffer->reader_page->write, 0); 1999 local_set(&cpu_buffer->reader_page->commit, 0); 2000 cpu_buffer->reader_page->read = 0; 2001 2002 cpu_buffer->overrun = 0; 2003 cpu_buffer->entries = 0; 2004 } 2005 2006 /** 2007 * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer 2008 * @buffer: The ring buffer to reset a per cpu buffer of 2009 * @cpu: The CPU buffer to be reset 2010 */ 2011 void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) 2012 { 2013 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2014 unsigned long flags; 2015 2016 if (!cpu_isset(cpu, buffer->cpumask)) 2017 return; 2018 2019 spin_lock_irqsave(&cpu_buffer->lock, flags); 2020 2021 rb_reset_cpu(cpu_buffer); 2022 2023 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2024 } 2025 2026 /** 2027 * ring_buffer_reset - reset a ring buffer 2028 * @buffer: The ring buffer to reset all cpu buffers 2029 */ 2030 void ring_buffer_reset(struct 
ring_buffer *buffer) 2031 { 2032 int cpu; 2033 2034 for_each_buffer_cpu(buffer, cpu) 2035 ring_buffer_reset_cpu(buffer, cpu); 2036 } 2037 2038 /** 2039 * rind_buffer_empty - is the ring buffer empty? 2040 * @buffer: The ring buffer to test 2041 */ 2042 int ring_buffer_empty(struct ring_buffer *buffer) 2043 { 2044 struct ring_buffer_per_cpu *cpu_buffer; 2045 int cpu; 2046 2047 /* yes this is racy, but if you don't like the race, lock the buffer */ 2048 for_each_buffer_cpu(buffer, cpu) { 2049 cpu_buffer = buffer->buffers[cpu]; 2050 if (!rb_per_cpu_empty(cpu_buffer)) 2051 return 0; 2052 } 2053 return 1; 2054 } 2055 2056 /** 2057 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? 2058 * @buffer: The ring buffer 2059 * @cpu: The CPU buffer to test 2060 */ 2061 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2062 { 2063 struct ring_buffer_per_cpu *cpu_buffer; 2064 2065 if (!cpu_isset(cpu, buffer->cpumask)) 2066 return 1; 2067 2068 cpu_buffer = buffer->buffers[cpu]; 2069 return rb_per_cpu_empty(cpu_buffer); 2070 } 2071 2072 /** 2073 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 2074 * @buffer_a: One buffer to swap with 2075 * @buffer_b: The other buffer to swap with 2076 * 2077 * This function is useful for tracers that want to take a "snapshot" 2078 * of a CPU buffer and has another back up buffer lying around. 2079 * it is expected that the tracer handles the cpu buffer not being 2080 * used at the moment. 
2081 */ 2082 int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, 2083 struct ring_buffer *buffer_b, int cpu) 2084 { 2085 struct ring_buffer_per_cpu *cpu_buffer_a; 2086 struct ring_buffer_per_cpu *cpu_buffer_b; 2087 2088 if (!cpu_isset(cpu, buffer_a->cpumask) || 2089 !cpu_isset(cpu, buffer_b->cpumask)) 2090 return -EINVAL; 2091 2092 /* At least make sure the two buffers are somewhat the same */ 2093 if (buffer_a->size != buffer_b->size || 2094 buffer_a->pages != buffer_b->pages) 2095 return -EINVAL; 2096 2097 cpu_buffer_a = buffer_a->buffers[cpu]; 2098 cpu_buffer_b = buffer_b->buffers[cpu]; 2099 2100 /* 2101 * We can't do a synchronize_sched here because this 2102 * function can be called in atomic context. 2103 * Normally this will be called from the same CPU as cpu. 2104 * If not it's up to the caller to protect this. 2105 */ 2106 atomic_inc(&cpu_buffer_a->record_disabled); 2107 atomic_inc(&cpu_buffer_b->record_disabled); 2108 2109 buffer_a->buffers[cpu] = cpu_buffer_b; 2110 buffer_b->buffers[cpu] = cpu_buffer_a; 2111 2112 cpu_buffer_b->buffer = buffer_a; 2113 cpu_buffer_a->buffer = buffer_b; 2114 2115 atomic_dec(&cpu_buffer_a->record_disabled); 2116 atomic_dec(&cpu_buffer_b->record_disabled); 2117 2118 return 0; 2119 } 2120 2121 static ssize_t 2122 rb_simple_read(struct file *filp, char __user *ubuf, 2123 size_t cnt, loff_t *ppos) 2124 { 2125 int *p = filp->private_data; 2126 char buf[64]; 2127 int r; 2128 2129 /* !ring_buffers_off == tracing_on */ 2130 r = sprintf(buf, "%d\n", !*p); 2131 2132 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2133 } 2134 2135 static ssize_t 2136 rb_simple_write(struct file *filp, const char __user *ubuf, 2137 size_t cnt, loff_t *ppos) 2138 { 2139 int *p = filp->private_data; 2140 char buf[64]; 2141 long val; 2142 int ret; 2143 2144 if (cnt >= sizeof(buf)) 2145 return -EINVAL; 2146 2147 if (copy_from_user(&buf, ubuf, cnt)) 2148 return -EFAULT; 2149 2150 buf[cnt] = 0; 2151 2152 ret = strict_strtoul(buf, 10, &val); 2153 if 
(ret < 0) 2154 return ret; 2155 2156 /* !ring_buffers_off == tracing_on */ 2157 *p = !val; 2158 2159 (*ppos)++; 2160 2161 return cnt; 2162 } 2163 2164 static struct file_operations rb_simple_fops = { 2165 .open = tracing_open_generic, 2166 .read = rb_simple_read, 2167 .write = rb_simple_write, 2168 }; 2169 2170 2171 static __init int rb_init_debugfs(void) 2172 { 2173 struct dentry *d_tracer; 2174 struct dentry *entry; 2175 2176 d_tracer = tracing_init_dentry(); 2177 2178 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 2179 &ring_buffers_off, &rb_simple_fops); 2180 if (!entry) 2181 pr_warning("Could not create debugfs 'tracing_on' entry\n"); 2182 2183 return 0; 2184 } 2185 2186 fs_initcall(rb_init_debugfs); 2187