xref: /linux/kernel/events/ring_buffer.c (revision ff5599816711d2e67da2d7561fd36ac48debd433)
/*
 * Performance events ring-buffer code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#include "internal.h"

static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
			      unsigned long offset, unsigned long head)
{
	unsigned long sz = perf_data_size(rb);
	unsigned long mask = sz - 1;

	/*
	 * check if user-writable
	 * overwrite : over-write its own tail
	 * !overwrite: buffer possibly drops events.
	 */
	if (rb->overwrite)
		return true;

	/*
	 * verify that payload is not bigger than buffer
	 * otherwise masking logic may fail to detect
	 * the "not enough space" condition
	 */
	if ((head - offset) > sz)
		return false;

	offset = (offset - tail) & mask;
	head   = (head   - tail) & mask;

	if ((int)(head - offset) < 0)
		return false;

	return true;
}
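
/*
 * Worked example (illustrative numbers): with a 4-page data area on a 4K
 * page system, sz = 0x4000 and mask = 0x3fff.  Suppose user space has
 * consumed up to tail = 0x1000, the current head is offset = 0x4e00 and a
 * 0x400-byte record would move head to 0x5200.  The payload check passes
 * (0x400 is not bigger than sz), but after rebasing on tail and masking,
 * offset = 0x3e00 and head = 0x0200, so (int)(head - offset) is negative:
 * only 0x200 bytes are free, the record does not fit, and the caller will
 * account it in rb->lost.
 */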

static void perf_output_wakeup(struct perf_output_handle *handle)
{
	atomic_set(&handle->rb->poll, POLL_IN);

	handle->event->pending_wakeup = 1;
	irq_work_queue(&handle->event->pending);
}

/*
 * We need to ensure a later event_id doesn't publish a head when a former
 * event isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
static void perf_output_get_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;

	preempt_disable();
	local_inc(&rb->nest);
	handle->wakeup = local_read(&rb->wakeup);
}

static void perf_output_put_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;
	unsigned long head;

again:
	head = local_read(&rb->head);

	/*
	 * IRQ/NMI can happen here, which means we can miss a head update.
	 */

	if (!local_dec_and_test(&rb->nest))
		goto out;

	/*
	 * Publish the known good head. Rely on the full barrier implied
	 * by local_dec_and_test() to order the rb->head read and this
	 * write.
	 */
	rb->user_page->data_head = head;

	/*
	 * Now check if we missed an update; rely on the (compiler)
	 * barrier in local_dec_and_test() to re-read rb->head.
	 */
	if (unlikely(head != local_read(&rb->head))) {
		local_inc(&rb->nest);
		goto again;
	}

	if (handle->wakeup != local_read(&rb->wakeup))
		perf_output_wakeup(handle);

out:
	preempt_enable();
}
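
/*
 * Illustrative interleaving of the nest/again dance above: a task is
 * between perf_output_get_handle() and perf_output_put_handle() with
 * rb->nest == 1 and has just read head.  An NMI hits, nests to
 * rb->nest == 2, writes its own record, advances rb->head and returns
 * without publishing (its local_dec_and_test() does not hit zero).  Back
 * in the task, local_dec_and_test() succeeds, but the head it read
 * predates the NMI's update; the head != local_read(&rb->head) re-check
 * notices this, re-takes the nest count and loops, so the user-visible
 * data_head always ends up covering the NMI's record.
 */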

int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size)
{
	struct ring_buffer *rb;
	unsigned long tail, offset, head;
	int have_lost;
	struct perf_sample_data sample_data;
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	rb = rcu_dereference(event->rb);
	if (!rb)
		goto out;

	handle->rb	= rb;
	handle->event	= event;

	if (!rb->nr_pages)
		goto out;

	have_lost = local_read(&rb->lost);
	if (have_lost) {
		lost_event.header.size = sizeof(lost_event);
		perf_event_header__init_id(&lost_event.header, &sample_data,
					   event);
		size += lost_event.header.size;
	}

	perf_output_get_handle(handle);

	do {
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer, so that all reads are completed before the
		 * write is issued.
		 */
		tail = ACCESS_ONCE(rb->user_page->data_tail);
		smp_rmb();
		offset = head = local_read(&rb->head);
		head += size;
		if (unlikely(!perf_output_space(rb, tail, offset, head)))
			goto fail;
	} while (local_cmpxchg(&rb->head, offset, head) != offset);

	if (head - local_read(&rb->wakeup) > rb->watermark)
		local_add(rb->watermark, &rb->wakeup);

	handle->page = offset >> (PAGE_SHIFT + page_order(rb));
	handle->page &= rb->nr_pages - 1;
	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
	handle->addr = rb->data_pages[handle->page];
	handle->addr += handle->size;
	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;

	if (have_lost) {
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.id          = event->id;
		lost_event.lost        = local_xchg(&rb->lost, 0);

		perf_output_put(handle, lost_event);
		perf_event__output_id_sample(event, handle, &sample_data);
	}

	return 0;

fail:
	local_inc(&rb->lost);
	perf_output_put_handle(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}
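
/*
 * Illustrative sketch (compiled out) of how callers such as
 * perf_output_sample() drive the output API; the function name and the
 * record layout below are hypothetical, roughly a PERF_SAMPLE_IP-only
 * sample.
 */
#if 0
static void example_emit_ip_sample(struct perf_event *event, u64 ip)
{
	struct perf_output_handle handle;
	struct {
		struct perf_event_header header;
		u64			 ip;
	} rec = {
		.header = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_KERNEL,
			.size = sizeof(rec),
		},
		.ip	= ip,
	};

	/* Reserve space; on failure the record is accounted in rb->lost. */
	if (perf_output_begin(&handle, event, rec.header.size))
		return;

	perf_output_put(&handle, rec);	/* copy the record into the buffer */
	perf_output_end(&handle);	/* publish data_head, maybe wake readers */
}
#endif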

unsigned int perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
{
	return __output_copy(handle, buf, len);
}

unsigned int perf_output_skip(struct perf_output_handle *handle,
			      unsigned int len)
{
	return __output_skip(handle, NULL, len);
}

void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}
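
/*
 * The data_tail read in perf_output_begin() pairs with a user-space reader
 * of the mmap'ed buffer, which is expected to do roughly (sketch, helper
 * names hypothetical):
 *
 *	head = pc->data_head;		// pc: the perf_event_mmap_page
 *	rmb();				// order the head read before data reads
 *	while (tail != head)
 *		tail += consume(base + (tail & mask));
 *	mb();				// finish data reads before freeing space
 *	pc->data_tail = tail;
 *
 * Only once data_tail advances does perf_output_space() treat that region
 * as writable again (in the !overwrite case).
 */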

static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
	long max_size = perf_data_size(rb);

	if (watermark)
		rb->watermark = min(max_size, watermark);

	if (!rb->watermark)
		rb->watermark = max_size / 2;

	if (flags & RING_BUFFER_WRITABLE)
		rb->overwrite = 0;
	else
		rb->overwrite = 1;

	atomic_set(&rb->refcount, 1);

	INIT_LIST_HEAD(&rb->event_list);
	spin_lock_init(&rb->event_lock);
}
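
/*
 * Illustrative numbers: for an 8-page data area on a 4K-page system,
 * perf_data_size() is 32768, so a caller passing watermark == 0 ends up
 * with rb->watermark = 16384, and perf_output_begin() advances rb->wakeup
 * (triggering a wakeup) roughly every 16K of produced data.  A watermark
 * larger than the buffer is clamped to max_size.
 */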

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	if (pgoff > rb->nr_pages)
		return NULL;

	if (pgoff == 0)
		return virt_to_page(rb->user_page);

	return virt_to_page(rb->data_pages[pgoff - 1]);
}
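
/*
 * The mmap layout this implements is one control page followed by the data
 * pages: pgoff 0 maps to rb->user_page (the perf_event_mmap_page), pgoff 1
 * to rb->data_pages[0], and pgoff rb->nr_pages to the last data page;
 * anything beyond that returns NULL and the fault is rejected.
 */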

static void *perf_mmap_alloc_page(int cpu)
{
	struct page *page;
	int node;

	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
	if (!page)
		return NULL;

	return page_address(page);
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	int i;

	size = sizeof(struct ring_buffer);
	size += nr_pages * sizeof(void *);

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	rb->user_page = perf_mmap_alloc_page(cpu);
	if (!rb->user_page)
		goto fail_user_page;

	for (i = 0; i < nr_pages; i++) {
		rb->data_pages[i] = perf_mmap_alloc_page(cpu);
		if (!rb->data_pages[i])
			goto fail_data_pages;
	}

	rb->nr_pages = nr_pages;

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_data_pages:
	for (i--; i >= 0; i--)
		free_page((unsigned long)rb->data_pages[i]);

	free_page((unsigned long)rb->user_page);

fail_user_page:
	kfree(rb);

fail:
	return NULL;
}
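
/*
 * Illustrative sizing: data_pages[] is laid out as a trailing array in
 * struct ring_buffer (see internal.h), so for nr_pages == 4 the kzalloc()
 * above is sizeof(struct ring_buffer) + 4 * sizeof(void *), and the user
 * page plus each of the four data pages is a separate order-0 allocation,
 * placed on the requested CPU's node when cpu != -1.
 */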

static void perf_mmap_free_page(unsigned long addr)
{
	struct page *page = virt_to_page((void *)addr);

	page->mapping = NULL;
	__free_page(page);
}

void rb_free(struct ring_buffer *rb)
{
	int i;

	perf_mmap_free_page((unsigned long)rb->user_page);
	for (i = 0; i < rb->nr_pages; i++)
		perf_mmap_free_page((unsigned long)rb->data_pages[i]);
	kfree(rb);
}

#else
static int data_page_nr(struct ring_buffer *rb)
{
	return rb->nr_pages << page_order(rb);
}

struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	/* The '>' (not '>=') accounts for the extra user page at pgoff 0. */
	if (pgoff > data_page_nr(rb))
		return NULL;

	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}

static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

static void rb_free_work(struct work_struct *work)
{
	struct ring_buffer *rb;
	void *base;
	int i, nr;

	rb = container_of(work, struct ring_buffer, work);
	nr = data_page_nr(rb);

	base = rb->user_page;
	/* The '<=' (not '<') accounts for the extra user page at the start. */
	for (i = 0; i <= nr; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
	kfree(rb);
}

void rb_free(struct ring_buffer *rb)
{
	schedule_work(&rb->work);
}

struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	void *all_buf;

	size = sizeof(struct ring_buffer);
	size += sizeof(void *);

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	INIT_WORK(&rb->work, rb_free_work);

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	rb->user_page = all_buf;
	rb->data_pages[0] = all_buf + PAGE_SIZE;
	rb->page_order = ilog2(nr_pages);
	rb->nr_pages = !!nr_pages;

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_all_buf:
	kfree(rb);

fail:
	return NULL;
}
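
/*
 * Illustrative numbers for this CONFIG_PERF_USE_VMALLOC variant: the whole
 * buffer is a single vmalloc area, so a request for nr_pages == 8 yields
 * rb->nr_pages == 1, rb->page_order == 3 and data_page_nr() == 8.  The
 * page_order() shifts in perf_output_begin() then treat that one order-3
 * "data page" as the entire 32K data area, and freeing is deferred to
 * rb_free_work() so that vfree() is never called from a context where it
 * is not allowed.
 */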

#endif