xref: /linux/arch/powerpc/perf/vpa-dtl.c (revision 03f76ddff5b04a808ae16c06418460151e2fdd4b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Perf interface to expose Dispatch Trace Log counters.
4  *
5  * Copyright (C) 2024 Kajol Jain, IBM Corporation
6  */
7 
8 #ifdef CONFIG_PPC_SPLPAR
9 #define pr_fmt(fmt) "vpa_dtl: " fmt
10 
11 #include <asm/dtl.h>
12 #include <linux/perf_event.h>
13 #include <asm/plpar_wrappers.h>
14 #include <linux/vmalloc.h>
15 
16 #define EVENT(_name, _code)     enum{_name = _code}
17 
18 /*
19  * Based on Power Architecture Platform Reference(PAPR) documentation,
20  * Table 14.14. Per Virtual Processor Area, below Dispatch Trace Log(DTL)
21  * Enable Mask used to get corresponding virtual processor dispatch
22  * to preempt traces:
23  *   DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual
24  *   processor waits
25  *   DTL_PREEMPT(0x2): Trace time slice preempts
26  *   DTL_FAULT(0x4): Trace virtual partition memory page
27  faults.
28  *   DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT)
29  *
30  * Event codes based on Dispatch Trace Log Enable Mask.
31  */
32 EVENT(DTL_CEDE,         0x1);
33 EVENT(DTL_PREEMPT,      0x2);
34 EVENT(DTL_FAULT,        0x4);
35 EVENT(DTL_ALL,          0x7);
36 
37 GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
38 GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
39 GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
40 GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);
41 
42 PMU_FORMAT_ATTR(event, "config:0-7");
43 
44 static struct attribute *events_attr[] = {
45 	GENERIC_EVENT_PTR(DTL_CEDE),
46 	GENERIC_EVENT_PTR(DTL_PREEMPT),
47 	GENERIC_EVENT_PTR(DTL_FAULT),
48 	GENERIC_EVENT_PTR(DTL_ALL),
49 	NULL
50 };
51 
52 static struct attribute_group event_group = {
53 	.name = "events",
54 	.attrs = events_attr,
55 };
56 
57 static struct attribute *format_attrs[] = {
58 	&format_attr_event.attr,
59 	NULL,
60 };
61 
62 static const struct attribute_group format_group = {
63 	.name = "format",
64 	.attrs = format_attrs,
65 };
66 
67 static const struct attribute_group *attr_groups[] = {
68 	&format_group,
69 	&event_group,
70 	NULL,
71 };
72 
73 struct vpa_dtl {
74 	struct dtl_entry	*buf;
75 	u64			last_idx;
76 };
77 
78 struct vpa_pmu_ctx {
79 	struct perf_output_handle handle;
80 };
81 
82 struct vpa_pmu_buf {
83 	int     nr_pages;
84 	bool    snapshot;
85 	u64     *base;
86 	u64     size;
87 	u64     head;
88 	u64	head_size;
89 	/* boot timebase and frequency needs to be saved only at once */
90 	int	boottb_freq_saved;
91 	u64	threshold;
92 	bool	full;
93 };
94 
95 /*
96  * To corelate each DTL entry with other events across CPU's,
97  * we need to map timebase from "struct dtl_entry" which phyp
98  * provides with boot timebase. This also needs timebase frequency.
99  * Formula is: ((timbase from DTL entry - boot time) / frequency)
100  *
101  * To match with size of "struct dtl_entry" to ease post processing,
102  * padded 24 bytes to the structure.
103  */
104 struct boottb_freq {
105 	u64	boot_tb;
106 	u64	tb_freq;
107 	u64	timebase;
108 	u64	padded[3];
109 };
110 
111 static DEFINE_PER_CPU(struct vpa_pmu_ctx, vpa_pmu_ctx);
112 static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);
113 
114 /* variable to capture reference count for the active dtl threads */
115 static int dtl_global_refc;
116 static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);
117 
118 /*
119  * Capture DTL data in AUX buffer
120  */
121 static void vpa_dtl_capture_aux(long *n_entries, struct vpa_pmu_buf *buf,
122 		struct vpa_dtl *dtl, int index)
123 {
124 	struct dtl_entry *aux_copy_buf = (struct dtl_entry *)buf->base;
125 
126 	/*
127 	 * check if there is enough space to contain the
128 	 * DTL data. If not, save the data for available
129 	 * memory and set full to true.
130 	 */
131 	if (buf->head + *n_entries >= buf->threshold) {
132 		*n_entries = buf->threshold - buf->head;
133 		buf->full = 1;
134 	}
135 
136 	/*
137 	 * Copy to AUX buffer from per-thread address
138 	 */
139 	memcpy(aux_copy_buf + buf->head, &dtl->buf[index], *n_entries * sizeof(struct dtl_entry));
140 
141 	if (buf->full) {
142 		/*
143 		 * Set head of private aux to zero when buffer is full
144 		 * so that next data will be copied to beginning of the
145 		 * buffer
146 		 */
147 		buf->head = 0;
148 		return;
149 	}
150 
151 	buf->head += *n_entries;
152 
153 	return;
154 }
155 
156 /*
157  * Function to dump the dispatch trace log buffer data to the
158  * perf data.
159  *
160  * perf_aux_output_begin: This function is called before writing
161  * to AUX area. This returns the pointer to aux area private structure,
162  * ie "struct vpa_pmu_buf" here which is set in setup_aux() function.
163  * The function obtains the output handle (used in perf_aux_output_end).
164  * when capture completes in vpa_dtl_capture_aux(), call perf_aux_output_end()
165  * to commit the recorded data.
166  *
167  * perf_aux_output_end: This function commits data by adjusting the
168  * aux_head of "struct perf_buffer". aux_tail will be moved in perf tools
169  * side when writing the data from aux buffer to perf.data file in disk.
170  *
171  * Here in the private aux structure, we maintain head to know where
172  * to copy data next time in the PMU driver. vpa_pmu_buf->head is moved to
173  * maintain the aux head for PMU driver. It is responsiblity of PMU
174  * driver to make sure data is copied between perf_aux_output_begin and
175  * perf_aux_output_end.
176  *
177  * After data is copied in vpa_dtl_capture_aux() function, perf_aux_output_end()
178  * is called to move the aux->head of "struct perf_buffer" to indicate size of
179  * data in aux buffer. This will post a PERF_RECORD_AUX into the perf buffer.
180  * Data will be written to disk only when the allocated buffer is full.
181  *
182  * By this approach, all the DTL data will be present as-is in the
183  * perf.data. The data will be pre-processed in perf tools side when doing
184  * perf report/perf script and this will avoid time taken to create samples
185  * in the kernel space.
186  */
187 static void vpa_dtl_dump_sample_data(struct perf_event *event)
188 {
189 	u64 cur_idx, last_idx, i;
190 	u64 boot_tb;
191 	struct boottb_freq boottb_freq;
192 
193 	/* actual number of entries read */
194 	long n_read = 0, read_size = 0;
195 
196 	/* number of entries added to dtl buffer */
197 	long n_req;
198 
199 	struct vpa_pmu_ctx *vpa_ctx = this_cpu_ptr(&vpa_pmu_ctx);
200 
201 	struct vpa_pmu_buf *aux_buf;
202 
203 	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
204 	u64 size;
205 
206 	cur_idx = be64_to_cpu(lppaca_of(event->cpu).dtl_idx);
207 	last_idx = dtl->last_idx;
208 
209 	if (last_idx + N_DISPATCH_LOG <= cur_idx)
210 		last_idx = cur_idx - N_DISPATCH_LOG + 1;
211 
212 	n_req = cur_idx - last_idx;
213 
214 	/* no new entry added to the buffer, return */
215 	if (n_req <= 0)
216 		return;
217 
218 	dtl->last_idx = last_idx + n_req;
219 	boot_tb = get_boot_tb();
220 
221 	i = last_idx % N_DISPATCH_LOG;
222 
223 	aux_buf = perf_aux_output_begin(&vpa_ctx->handle, event);
224 	if (!aux_buf) {
225 		pr_debug("returning. no aux\n");
226 		return;
227 	}
228 
229 	if (!aux_buf->boottb_freq_saved) {
230 		pr_debug("Copying boot tb to aux buffer: %lld\n", boot_tb);
231 		/* Save boot_tb to convert raw timebase to it's relative system boot time */
232 		boottb_freq.boot_tb = boot_tb;
233 		/* Save tb_ticks_per_sec to convert timebase to sec */
234 		boottb_freq.tb_freq = tb_ticks_per_sec;
235 		boottb_freq.timebase = 0;
236 		memcpy(aux_buf->base, &boottb_freq, sizeof(boottb_freq));
237 		aux_buf->head += 1;
238 		aux_buf->boottb_freq_saved = 1;
239 		n_read += 1;
240 	}
241 
242 	/* read the tail of the buffer if we've wrapped */
243 	if (i + n_req > N_DISPATCH_LOG) {
244 		read_size = N_DISPATCH_LOG - i;
245 		vpa_dtl_capture_aux(&read_size, aux_buf, dtl, i);
246 		n_req -= read_size;
247 		n_read += read_size;
248 		i = 0;
249 		if (aux_buf->full) {
250 			size = (n_read * sizeof(struct dtl_entry));
251 			if ((size +  aux_buf->head_size) > aux_buf->size) {
252 				size = aux_buf->size - aux_buf->head_size;
253 				perf_aux_output_end(&vpa_ctx->handle, size);
254 				aux_buf->head = 0;
255 				aux_buf->head_size = 0;
256 			} else {
257 				aux_buf->head_size += (n_read * sizeof(struct dtl_entry));
258 				perf_aux_output_end(&vpa_ctx->handle, n_read * sizeof(struct dtl_entry));
259 			}
260 			goto out;
261 		}
262 	}
263 
264 	/* .. and now the head */
265 	vpa_dtl_capture_aux(&n_req, aux_buf, dtl, i);
266 
267 	size = ((n_req + n_read) * sizeof(struct dtl_entry));
268 	if ((size +  aux_buf->head_size) > aux_buf->size) {
269 		size = aux_buf->size - aux_buf->head_size;
270 		perf_aux_output_end(&vpa_ctx->handle, size);
271 		aux_buf->head = 0;
272 		aux_buf->head_size = 0;
273 	} else {
274 		aux_buf->head_size += ((n_req + n_read) * sizeof(struct dtl_entry));
275 		/* Move the aux->head to indicate size of data in aux buffer */
276 		perf_aux_output_end(&vpa_ctx->handle, (n_req + n_read) * sizeof(struct dtl_entry));
277 	}
278 out:
279 	aux_buf->full = 0;
280 }
281 
282 /*
283  * The VPA Dispatch Trace log counters do not interrupt on overflow.
284  * Therefore, the kernel needs to poll the counters to avoid missing
285  * an overflow using hrtimer. The timer interval is based on sample_period
286  * count provided by user, and minimum interval is 1 millisecond.
287  */
288 static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
289 {
290 	struct perf_event *event;
291 	u64 period;
292 
293 	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
294 
295 	if (event->state != PERF_EVENT_STATE_ACTIVE)
296 		return HRTIMER_NORESTART;
297 
298 	vpa_dtl_dump_sample_data(event);
299 	period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
300 	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
301 
302 	return HRTIMER_RESTART;
303 }
304 
305 static void vpa_dtl_start_hrtimer(struct perf_event *event)
306 {
307 	u64 period;
308 	struct hw_perf_event *hwc = &event->hw;
309 
310 	period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
311 	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
312 }
313 
314 static void vpa_dtl_stop_hrtimer(struct perf_event *event)
315 {
316 	struct hw_perf_event *hwc = &event->hw;
317 
318 	hrtimer_cancel(&hwc->hrtimer);
319 }
320 
321 static void vpa_dtl_reset_global_refc(struct perf_event *event)
322 {
323 	spin_lock(&dtl_global_lock);
324 	dtl_global_refc--;
325 	if (dtl_global_refc <= 0) {
326 		dtl_global_refc = 0;
327 		up_write(&dtl_access_lock);
328 	}
329 	spin_unlock(&dtl_global_lock);
330 }
331 
332 static int vpa_dtl_mem_alloc(int cpu)
333 {
334 	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
335 	struct dtl_entry *buf = NULL;
336 
337 	/* Check for dispatch trace log buffer cache */
338 	if (!dtl_cache)
339 		return -ENOMEM;
340 
341 	buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL | GFP_ATOMIC, cpu_to_node(cpu));
342 	if (!buf) {
343 		pr_warn("buffer allocation failed for cpu %d\n", cpu);
344 		return -ENOMEM;
345 	}
346 	dtl->buf = buf;
347 	return 0;
348 }
349 
350 static int vpa_dtl_event_init(struct perf_event *event)
351 {
352 	struct hw_perf_event *hwc = &event->hw;
353 
354 	/* test the event attr type for PMU enumeration */
355 	if (event->attr.type != event->pmu->type)
356 		return -ENOENT;
357 
358 	if (!perfmon_capable())
359 		return -EACCES;
360 
361 	/* Return if this is a counting event */
362 	if (!is_sampling_event(event))
363 		return -EOPNOTSUPP;
364 
365 	/* no branch sampling */
366 	if (has_branch_stack(event))
367 		return -EOPNOTSUPP;
368 
369 	/* Invalid eventcode */
370 	switch (event->attr.config) {
371 	case DTL_LOG_CEDE:
372 	case DTL_LOG_PREEMPT:
373 	case DTL_LOG_FAULT:
374 	case DTL_LOG_ALL:
375 		break;
376 	default:
377 		return -EINVAL;
378 	}
379 
380 	spin_lock(&dtl_global_lock);
381 
382 	/*
383 	 * To ensure there are no other conflicting dtl users
384 	 * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl),
385 	 * below code try to take the dtl_access_lock.
386 	 * The dtl_access_lock is a rwlock defined in dtl.h, which is used
387 	 * to unsure there is no conflicting dtl users.
388 	 * Based on below code, vpa_dtl pmu tries to take write access lock
389 	 * and also checks for dtl_global_refc, to make sure that the
390 	 * dtl_access_lock is taken by vpa_dtl pmu interface.
391 	 */
392 	if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
393 		spin_unlock(&dtl_global_lock);
394 		return -EBUSY;
395 	}
396 
397 	/* Allocate dtl buffer memory */
398 	if (vpa_dtl_mem_alloc(event->cpu)) {
399 		spin_unlock(&dtl_global_lock);
400 		return -ENOMEM;
401 	}
402 
403 	/*
404 	 * Increment the number of active vpa_dtl pmu threads. The
405 	 * dtl_global_refc is used to keep count of cpu threads that
406 	 * currently capturing dtl data using vpa_dtl pmu interface.
407 	 */
408 	dtl_global_refc++;
409 
410 	spin_unlock(&dtl_global_lock);
411 
412 	hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
413 
414 	/*
415 	 * Since hrtimers have a fixed rate, we can do a static freq->period
416 	 * mapping and avoid the whole period adjust feedback stuff.
417 	 */
418 	if (event->attr.freq) {
419 		long freq = event->attr.sample_freq;
420 
421 		event->attr.sample_period = NSEC_PER_SEC / freq;
422 		hwc->sample_period = event->attr.sample_period;
423 		local64_set(&hwc->period_left, hwc->sample_period);
424 		hwc->last_period = hwc->sample_period;
425 		event->attr.freq = 0;
426 	}
427 
428 	event->destroy = vpa_dtl_reset_global_refc;
429 	return 0;
430 }
431 
432 static int vpa_dtl_event_add(struct perf_event *event, int flags)
433 {
434 	int ret, hwcpu;
435 	unsigned long addr;
436 	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
437 
438 	/*
439 	 * Register our dtl buffer with the hypervisor. The
440 	 * HV expects the buffer size to be passed in the second
441 	 * word of the buffer. Refer section '14.11.3.2. H_REGISTER_VPA'
442 	 * from PAPR for more information.
443 	 */
444 	((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
445 	dtl->last_idx = 0;
446 
447 	hwcpu = get_hard_smp_processor_id(event->cpu);
448 	addr = __pa(dtl->buf);
449 
450 	ret = register_dtl(hwcpu, addr);
451 	if (ret) {
452 		pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
453 			event->cpu, hwcpu, ret);
454 		return ret;
455 	}
456 
457 	/* set our initial buffer indices */
458 	lppaca_of(event->cpu).dtl_idx = 0;
459 
460 	/*
461 	 * Ensure that our updates to the lppaca fields have
462 	 * occurred before we actually enable the logging
463 	 */
464 	smp_wmb();
465 
466 	/* enable event logging */
467 	lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;
468 
469 	vpa_dtl_start_hrtimer(event);
470 
471 	return 0;
472 }
473 
474 static void vpa_dtl_event_del(struct perf_event *event, int flags)
475 {
476 	int hwcpu = get_hard_smp_processor_id(event->cpu);
477 	struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
478 
479 	vpa_dtl_stop_hrtimer(event);
480 	unregister_dtl(hwcpu);
481 	kmem_cache_free(dtl_cache, dtl->buf);
482 	dtl->buf = NULL;
483 	lppaca_of(event->cpu).dtl_enable_mask = 0x0;
484 }
485 
/*
 * Intentionally empty: the dispatch trace log data is parsed and dumped
 * to the perf data by vpa_dtl_dump_sample_data(), so there is nothing to
 * do on read.
 */
static void vpa_dtl_event_read(struct perf_event *event)
{
}
494 
495 /*
496  * Set up pmu-private data structures for an AUX area
497  * **pages contains the aux buffer allocated for this event
498  * for the corresponding cpu. rb_alloc_aux uses "alloc_pages_node"
499  * and returns pointer to each page address. Map these pages to
500  * contiguous space using vmap and use that as base address.
501  *
502  * The aux private data structure ie, "struct vpa_pmu_buf" mainly
503  * saves
504  * - buf->base: aux buffer base address
505  * - buf->head: offset from base address where data will be written to.
506  * - buf->size: Size of allocated memory
507  */
508 static void *vpa_dtl_setup_aux(struct perf_event *event, void **pages,
509 		int nr_pages, bool snapshot)
510 {
511 	int i, cpu = event->cpu;
512 	struct vpa_pmu_buf *buf __free(kfree) = NULL;
513 	struct page **pglist __free(kfree) = NULL;
514 
515 	/* We need at least one page for this to work. */
516 	if (!nr_pages)
517 		return NULL;
518 
519 	if (cpu == -1)
520 		cpu = raw_smp_processor_id();
521 
522 	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
523 	if (!buf)
524 		return NULL;
525 
526 	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
527 	if (!pglist)
528 		return NULL;
529 
530 	for (i = 0; i < nr_pages; ++i)
531 		pglist[i] = virt_to_page(pages[i]);
532 
533 	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
534 	if (!buf->base)
535 		return NULL;
536 
537 	buf->nr_pages = nr_pages;
538 	buf->snapshot = false;
539 
540 	buf->size = nr_pages << PAGE_SHIFT;
541 	buf->head = 0;
542 	buf->head_size = 0;
543 	buf->boottb_freq_saved = 0;
544 	buf->threshold = ((buf->size - 32) / sizeof(struct dtl_entry));
545 	return no_free_ptr(buf);
546 }
547 
548 /*
549  * free pmu-private AUX data structures
550  */
551 static void vpa_dtl_free_aux(void *aux)
552 {
553 	struct vpa_pmu_buf *buf = aux;
554 
555 	vunmap(buf->base);
556 	kfree(buf);
557 }
558 
559 static struct pmu vpa_dtl_pmu = {
560 	.task_ctx_nr = perf_invalid_context,
561 
562 	.name = "vpa_dtl",
563 	.attr_groups = attr_groups,
564 	.event_init  = vpa_dtl_event_init,
565 	.add         = vpa_dtl_event_add,
566 	.del         = vpa_dtl_event_del,
567 	.read        = vpa_dtl_event_read,
568 	.setup_aux   = vpa_dtl_setup_aux,
569 	.free_aux    = vpa_dtl_free_aux,
570 	.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
571 };
572 
573 static int vpa_dtl_init(void)
574 {
575 	int r;
576 
577 	if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
578 		pr_debug("not a shared virtualized system, not enabling\n");
579 		return -ENODEV;
580 	}
581 
582 	/* This driver is intended only for L1 host. */
583 	if (is_kvm_guest()) {
584 		pr_debug("Only supported for L1 host system\n");
585 		return -ENODEV;
586 	}
587 
588 	r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
589 	if (r)
590 		return r;
591 
592 	return 0;
593 }
594 
595 device_initcall(vpa_dtl_init);
596 #endif //CONFIG_PPC_SPLPAR
597