/*
 * Performance events ring-buffer code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#include "internal.h"

static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
                              unsigned long offset, unsigned long head)
{
        unsigned long sz = perf_data_size(rb);
        unsigned long mask = sz - 1;

        /*
         * check if user-writable
         * overwrite : over-write its own tail
         * !overwrite: buffer possibly drops events.
         */
        if (rb->overwrite)
                return true;

        /*
         * verify that payload is not bigger than buffer
         * otherwise masking logic may fail to detect
         * the "not enough space" condition
         */
        if ((head - offset) > sz)
                return false;

        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;

        if ((int)(head - offset) < 0)
                return false;

        return true;
}

static void perf_output_wakeup(struct perf_output_handle *handle)
{
        atomic_set(&handle->rb->poll, POLL_IN);

        handle->event->pending_wakeup = 1;
        irq_work_queue(&handle->event->pending);
}

/*
 * We need to ensure a later event_id doesn't publish a head when a former
 * event isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
static void perf_output_get_handle(struct perf_output_handle *handle)
{
        struct ring_buffer *rb = handle->rb;

        preempt_disable();
        local_inc(&rb->nest);
        handle->wakeup = local_read(&rb->wakeup);
}

static void perf_output_put_handle(struct perf_output_handle *handle)
{
        struct ring_buffer *rb = handle->rb;
        unsigned long head;

again:
        head = local_read(&rb->head);

        /*
         * IRQ/NMI can happen here, which means we can miss a head update.
         */

        if (!local_dec_and_test(&rb->nest))
                goto out;

        /*
         * Publish the known good head. Rely on the full barrier implied
         * by local_dec_and_test() to order the rb->head read and this
         * write.
         */
        rb->user_page->data_head = head;

        /*
         * Now check if we missed an update, rely on the (compiler)
         * barrier in local_dec_and_test() to re-read rb->head.
         */
        if (unlikely(head != local_read(&rb->head))) {
                local_inc(&rb->nest);
                goto again;
        }

        if (handle->wakeup != local_read(&rb->wakeup))
                perf_output_wakeup(handle);

out:
        preempt_enable();
}
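
/*
 * Example interleaving (for illustration): writer A is interrupted by
 * writer B running from NMI context after A has reserved its space.
 *
 *   A: perf_output_get_handle()          rb->nest == 1
 *   B: perf_output_get_handle()          rb->nest == 2
 *   B: ... writes and completes ...
 *   B: perf_output_put_handle()          rb->nest == 1, no publish
 *   A: ... finishes writing ...
 *   A: perf_output_put_handle()          rb->nest == 0, publishes a head
 *                                        that covers both records
 *
 * If another NMI writer advances rb->head after A sampled it, the
 * "head != local_read(&rb->head)" re-check above notices the newer value
 * and loops to publish it as well.
 */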

int perf_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event, unsigned int size)
{
        struct ring_buffer *rb;
        unsigned long tail, offset, head;
        int have_lost;
        struct perf_sample_data sample_data;
        struct {
                struct perf_event_header header;
                u64 id;
                u64 lost;
        } lost_event;

        rcu_read_lock();
        /*
         * For inherited events we send all the output towards the parent.
         */
        if (event->parent)
                event = event->parent;

        rb = rcu_dereference(event->rb);
        if (!rb)
                goto out;

        handle->rb = rb;
        handle->event = event;

        if (!rb->nr_pages)
                goto out;

        have_lost = local_read(&rb->lost);
        if (have_lost) {
                lost_event.header.size = sizeof(lost_event);
                perf_event_header__init_id(&lost_event.header, &sample_data,
                                           event);
                size += lost_event.header.size;
        }

        perf_output_get_handle(handle);

        do {
                /*
                 * Userspace could choose to issue a mb() before updating the
                 * tail pointer, so that all reads are completed before the
                 * write is issued.
                 */
                tail = ACCESS_ONCE(rb->user_page->data_tail);
                smp_rmb();
                offset = head = local_read(&rb->head);
                head += size;
                if (unlikely(!perf_output_space(rb, tail, offset, head)))
                        goto fail;
        } while (local_cmpxchg(&rb->head, offset, head) != offset);

        if (head - local_read(&rb->wakeup) > rb->watermark)
                local_add(rb->watermark, &rb->wakeup);

        handle->page = offset >> (PAGE_SHIFT + page_order(rb));
        handle->page &= rb->nr_pages - 1;
        handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
        handle->addr = rb->data_pages[handle->page];
        handle->addr += handle->size;
        handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;

        if (have_lost) {
                lost_event.header.type = PERF_RECORD_LOST;
                lost_event.header.misc = 0;
                lost_event.id          = event->id;
                lost_event.lost        = local_xchg(&rb->lost, 0);

                perf_output_put(handle, lost_event);
                perf_event__output_id_sample(event, handle, &sample_data);
        }

        return 0;

fail:
        local_inc(&rb->lost);
        perf_output_put_handle(handle);
out:
        rcu_read_unlock();

        return -ENOSPC;
}

unsigned int perf_output_copy(struct perf_output_handle *handle,
                              const void *buf, unsigned int len)
{
        return __output_copy(handle, buf, len);
}

unsigned int perf_output_skip(struct perf_output_handle *handle,
                              unsigned int len)
{
        return __output_skip(handle, NULL, len);
}

void perf_output_end(struct perf_output_handle *handle)
{
        perf_output_put_handle(handle);
        rcu_read_unlock();
}
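
/*
 * Illustrative sketch (hypothetical, compiled out): the usual calling
 * pattern for the output path above.  A writer reserves space with
 * perf_output_begin(), fills it with perf_output_put()/perf_output_copy()
 * and publishes the record with perf_output_end().  The record layout and
 * the example_emit_record() name are made up; real writers live in
 * kernel/events/core.c.
 */
#if 0
static void example_emit_record(struct perf_event *event, u64 value)
{
        struct perf_output_handle handle;
        struct {
                struct perf_event_header header;
                u64 value;
        } rec = {
                .header = {
                        .type = 0,      /* hypothetical record type */
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .value = value,
        };

        /* Fails with -ENOSPC when the record does not fit. */
        if (perf_output_begin(&handle, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_output_end(&handle);
}
#endif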

static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
        long max_size = perf_data_size(rb);

        if (watermark)
                rb->watermark = min(max_size, watermark);

        if (!rb->watermark)
                rb->watermark = max_size / 2;

        if (flags & RING_BUFFER_WRITABLE)
                rb->overwrite = 0;
        else
                rb->overwrite = 1;

        atomic_set(&rb->refcount, 1);

        INIT_LIST_HEAD(&rb->event_list);
        spin_lock_init(&rb->event_lock);
}

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
        if (pgoff > rb->nr_pages)
                return NULL;

        if (pgoff == 0)
                return virt_to_page(rb->user_page);

        return virt_to_page(rb->data_pages[pgoff - 1]);
}

static void *perf_mmap_alloc_page(int cpu)
{
        struct page *page;
        int node;

        node = (cpu == -1) ? cpu : cpu_to_node(cpu);
        page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
        if (!page)
                return NULL;

        return page_address(page);
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
        struct ring_buffer *rb;
        unsigned long size;
        int i;

        size = sizeof(struct ring_buffer);
        size += nr_pages * sizeof(void *);

        rb = kzalloc(size, GFP_KERNEL);
        if (!rb)
                goto fail;

        rb->user_page = perf_mmap_alloc_page(cpu);
        if (!rb->user_page)
                goto fail_user_page;

        for (i = 0; i < nr_pages; i++) {
                rb->data_pages[i] = perf_mmap_alloc_page(cpu);
                if (!rb->data_pages[i])
                        goto fail_data_pages;
        }

        rb->nr_pages = nr_pages;

        ring_buffer_init(rb, watermark, flags);

        return rb;

fail_data_pages:
        for (i--; i >= 0; i--)
                free_page((unsigned long)rb->data_pages[i]);

        free_page((unsigned long)rb->user_page);

fail_user_page:
        kfree(rb);

fail:
        return NULL;
}

static void perf_mmap_free_page(unsigned long addr)
{
        struct page *page = virt_to_page((void *)addr);

        page->mapping = NULL;
        __free_page(page);
}

void rb_free(struct ring_buffer *rb)
{
        int i;

        perf_mmap_free_page((unsigned long)rb->user_page);
        for (i = 0; i < rb->nr_pages; i++)
                perf_mmap_free_page((unsigned long)rb->data_pages[i]);
        kfree(rb);
}

#else

static int data_page_nr(struct ring_buffer *rb)
{
        return rb->nr_pages << page_order(rb);
}

struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
        /* The '>' counts in the user page. */
        if (pgoff > data_page_nr(rb))
                return NULL;

        return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}

static void perf_mmap_unmark_page(void *addr)
{
        struct page *page = vmalloc_to_page(addr);

        page->mapping = NULL;
}

static void rb_free_work(struct work_struct *work)
{
        struct ring_buffer *rb;
        void *base;
        int i, nr;

        rb = container_of(work, struct ring_buffer, work);
        nr = data_page_nr(rb);

        base = rb->user_page;
        /* The '<=' counts in the user page. */
        for (i = 0; i <= nr; i++)
                perf_mmap_unmark_page(base + (i * PAGE_SIZE));

        vfree(base);
        kfree(rb);
}

void rb_free(struct ring_buffer *rb)
{
        schedule_work(&rb->work);
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
        struct ring_buffer *rb;
        unsigned long size;
        void *all_buf;

        size = sizeof(struct ring_buffer);
        size += sizeof(void *);

        rb = kzalloc(size, GFP_KERNEL);
        if (!rb)
                goto fail;

        INIT_WORK(&rb->work, rb_free_work);

        all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
        if (!all_buf)
                goto fail_all_buf;

        rb->user_page = all_buf;
        rb->data_pages[0] = all_buf + PAGE_SIZE;
        rb->page_order = ilog2(nr_pages);
        rb->nr_pages = !!nr_pages;

        ring_buffer_init(rb, watermark, flags);

        return rb;

fail_all_buf:
        kfree(rb);

fail:
        return NULL;
}

#endif
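
/*
 * Illustrative sketch (hypothetical, compiled out): how rb_alloc() and
 * rb_free() are meant to pair up.  The real caller is perf_mmap() in
 * kernel/events/core.c, which also ensures nr_pages is 0 or a power of
 * two -- the "size - 1" masking in perf_output_space() and the page
 * indexing in perf_output_begin() depend on that.  The function name
 * below is made up.
 */
#if 0
static struct ring_buffer *example_rb_setup(int cpu)
{
        struct ring_buffer *rb;

        /* Eight data pages plus the user page; watermark 0 means half the buffer. */
        rb = rb_alloc(8, 0, cpu, RING_BUFFER_WRITABLE);
        if (!rb)
                return NULL;

        /*
         * The buffer starts with a reference count of one (see
         * ring_buffer_init()); it is released via rb_free() once the
         * last reference is dropped.
         */
        return rb;
}
#endif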