xref: /freebsd/sys/arm64/spe/arm_spe_backend.c (revision 68f185ccc9f8f9498d536f4737d888b37cf11882)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Arm Ltd
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * Portions of this software were developed by Andrew Turner under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Arm Statistical Profiling Extension (SPE) backend
 *
 * Basic SPE operation
 *
 *   SPE is enabled and configured on a per-core basis, with each core
 *   requiring separate code to enable and configure it. Each core also
 *   requires a separate buffer, passed as config, where the CPU will write
 *   profiling data. When the profiling buffer is full, an interrupt is taken
 *   on the same CPU.
 *
 * Driver Design
 *
 * - HWT allocates a single large buffer per core. This buffer is split in half
 *   to create a 2-element circular buffer (aka ping-pong buffer) where the
 *   kernel writes to one half while userspace is copying the other half
 * - SMP calls are used to enable and configure each core, with SPE initially
 *   configured to write to the first half of the buffer
 * - When the first half of the buffer is full, a buffer-full interrupt
 *   immediately switches writing to the second half. The kernel adds the
 *   details of the half that needs copying to a FIFO STAILQ and notifies
 *   userspace via kqueue by sending an ARM_SPE_KQ_BUF kevent indicating how
 *   many buffers on the queue need servicing
 * - The kernel responds to the HWT_IOC_BUFPTR_GET ioctl by sending details of
 *   the first item from the queue
 * - Buffers pending copying will not be overwritten until an
 *   HWT_IOC_SVC_BUF ioctl is received from userspace confirming the data has
 *   been copied out
 * - In the case where both halves of the buffer are full, profiling is
 *   paused until notification via HWT_IOC_SVC_BUF is received
 *   (an illustrative sketch of the expected userspace loop follows this
 *   comment)
 *
 * Future improvements and limitations
 *
 * - Using large buffer sizes should minimise pauses and loss of profiling
 *   data while the kernel is waiting for userspace to copy out data. Since it
 *   is generally expected that consuming (copying) this data is faster than
 *   producing it, in practice this has not so far been an issue. If it does
 *   prove to be an issue even with large buffer sizes, then additional
 *   buffering, i.e. an n-element circular buffer, might be required.
 *
 * - kqueue can only notify and queue one kevent of a given type, with
 *   subsequent events overwriting data in the first event. The ARM_SPE_KQ_BUF
 *   kevent can therefore only carry the number of buffers on the STAILQ,
 *   incremented each time a new buffer is full. In this case kqueue serves
 *   just as a notification to userspace to wake up and query the kernel with
 *   the appropriate ioctl. An alternative might be custom kevents where the
 *   kevent identifier is encoded with something like n+cpu_id or n+tid. In
 *   that case data could be sent directly with kqueue via the kevent data and
 *   fflags fields, avoiding the extra ioctl.
 */
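
/*
 * Illustrative sketch (not part of the driver) of the userspace consumption
 * loop the above design assumes. The ioctl argument structures are elided and
 * "kq_fd", "hwt_fd", "bufptr_get" and "svc_buf" are placeholder names, not
 * the real interface:
 *
 *	struct kevent kev;
 *
 *	for (;;) {
 *		kevent(kq_fd, NULL, 0, &kev, 1, NULL);
 *		if (kev.ident == ARM_SPE_KQ_SHUTDOWN)
 *			break;
 *		// ARM_SPE_KQ_BUF: number of queued buffers assumed in kev.data
 *		for (int64_t n = kev.data; n > 0; n--) {
 *			ioctl(hwt_fd, HWT_IOC_BUFPTR_GET, &bufptr_get);
 *			// copy out the half-buffer described by bufptr_get
 *			ioctl(hwt_fd, HWT_IOC_SVC_BUF, &svc_buf);
 *		}
 *	}
 */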

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <machine/bus.h>

#include <arm64/spe/arm_spe_dev.h>

#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_thread.h>

MALLOC_DECLARE(M_ARM_SPE);

extern u_int mp_maxid;
extern struct taskqueue *taskqueue_arm_spe;

int spe_backend_disable_smp(struct hwt_context *ctx);

static device_t spe_dev;
static struct hwt_backend_ops spe_ops;
static struct hwt_backend backend = {
	.ops = &spe_ops,
	.name = "spe",
	.kva_req = 1,
};

static struct arm_spe_info *spe_info;

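/*
 * Thread mode (HWT_MODE_THREAD) tracing is not currently supported by this
 * backend; only CPU mode is implemented.
 */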
static int
spe_backend_init_thread(struct hwt_context *ctx)
{
	return (ENOTSUP);
}

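/*
 * Allocate the per-CPU arm_spe_info array and initialise the entries for each
 * CPU in the tracing context: back-pointers, the two half-buffer descriptors
 * used for ping-pong buffering, and a per-CPU spin lock.
 */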
static void
spe_backend_init_cpu(struct hwt_context *ctx)
{
	struct arm_spe_info *info;
	struct arm_spe_softc *sc = device_get_softc(spe_dev);
	char lock_name[32];
	char *tmp = "Arm SPE lock/cpu/";
	int cpu_id;

	spe_info = malloc(sizeof(struct arm_spe_info) * mp_ncpus,
	    M_ARM_SPE, M_WAITOK | M_ZERO);

	sc->spe_info = spe_info;

	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		info = &spe_info[cpu_id];
		info->sc = sc;
		info->ident = cpu_id;
		info->buf_info[0].info = info;
		info->buf_info[0].buf_idx = 0;
		info->buf_info[1].info = info;
		info->buf_info[1].buf_idx = 1;
		snprintf(lock_name, sizeof(lock_name), "%s%d", tmp, cpu_id);
		mtx_init(&info->lock, lock_name, NULL, MTX_SPIN);
	}
}

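/*
 * Validate the HWT buffer geometry, stash the kqueue and thread details from
 * the context, then hand off to the thread- or CPU-mode init path.
 */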
static int
spe_backend_init(struct hwt_context *ctx)
{
	struct arm_spe_softc *sc = device_get_softc(spe_dev);
	int error = 0;

	/*
	 * HWT currently requires the buffer size to be a multiple of
	 * PAGE_SIZE (i.e. a minimum of 4KB), and the maximum PMBIDR.Align is
	 * 2KB, so this check should never fail, but it is a useful sanity
	 * check.
	 */
	if (ctx->bufsize % sc->kva_align != 0)
		return (EINVAL);

	/*
	 * Since we're splitting the buffer in half, and PMBLIMITR needs to be
	 * page aligned, the minimum buffer size is 2x PAGE_SIZE.
	 */
	if (ctx->bufsize < (2 * PAGE_SIZE))
		return (EINVAL);

	sc->ctx = ctx;
	sc->kqueue_fd = ctx->kqueue_fd;
	sc->hwt_td = ctx->hwt_td;

	if (ctx->mode == HWT_MODE_THREAD)
		error = spe_backend_init_thread(ctx);
	else
		spe_backend_init_cpu(ctx);

	return (error);
}

#ifdef ARM_SPE_DEBUG
static void hex_dump(uint8_t *buf, size_t len)
{
	size_t i;

	printf("--------------------------------------------------------------\n");
	for (i = 0; i < len; ++i) {
		if (i % 8 == 0) {
			printf(" ");
		}
		if (i % 16 == 0) {
			if (i != 0) {
				printf("\r\n");
			}
			printf("\t");
		}
		printf("%02X ", buf[i]);
	}
	printf("\r\n");
}
#endif

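/*
 * Tear the backend down: stop tracing if the context is still running and
 * release the per-CPU state.
 */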
static int
spe_backend_deinit(struct hwt_context *ctx)
{
#ifdef ARM_SPE_DEBUG
	struct arm_spe_info *info;
	int cpu_id;

	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		info = &spe_info[cpu_id];
		hex_dump((void *)info->kvaddr, 128);
		hex_dump((void *)(info->kvaddr + (info->buf_size/2)), 128);
	}
#endif

	if (ctx->state == CTX_STATE_RUNNING) {
		spe_backend_disable_smp(ctx);
		ctx->state = CTX_STATE_STOPPED;
	}

	free(spe_info, M_ARM_SPE);

	return (0);
}

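/*
 * Decode PMSIDR_EL1.Interval into the minimum supported sampling interval,
 * which is IMPLEMENTATION DEFINED per CPU.
 */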
static uint64_t
arm_spe_min_interval(struct arm_spe_softc *sc)
{
	/* IMPLEMENTATION DEFINED */
	switch (PMSIDR_Interval_VAL(sc->pmsidr)) {
	case PMSIDR_Interval_256:
		return (256);
	case PMSIDR_Interval_512:
		return (512);
	case PMSIDR_Interval_768:
		return (768);
	case PMSIDR_Interval_1024:
		return (1024);
	case PMSIDR_Interval_1536:
		return (1536);
	case PMSIDR_Interval_2048:
		return (2048);
	case PMSIDR_Interval_3072:
		return (3072);
	case PMSIDR_Interval_4096:
		return (4096);
	default:
		return (4096);
	}
}

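/*
 * Clamp the requested sampling interval to the supported range (the
 * IMPLEMENTATION DEFINED minimum up to the 24-bit maximum) and program it
 * into the cached PMSIRR value.
 */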
static inline void
arm_spe_set_interval(struct arm_spe_info *info, uint64_t interval)
{
	uint64_t min_interval = arm_spe_min_interval(info->sc);

	interval = MAX(interval, min_interval);
	interval = MIN(interval, 1 << 24);	/* max 24 bits */

	dprintf("%s %lu\n", __func__, interval);

	info->pmsirr &= ~(PMSIRR_INTERVAL_MASK);
	info->pmsirr |= (interval << PMSIRR_INTERVAL_SHIFT);
}

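/*
 * Build the per-CPU register configuration: start from the defaults, then
 * apply the version 1 arm_spe_config supplied by userspace (sampling
 * interval, kernel/user level filtering and CONTEXTIDR usage).
 */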
static int
spe_backend_configure(struct hwt_context *ctx, int cpu_id, int session_id)
{
	struct arm_spe_info *info = &spe_info[cpu_id];
	struct arm_spe_config *cfg;
	int err = 0;

	mtx_lock_spin(&info->lock);
	info->ident = cpu_id;
	/* Set defaults */
	info->pmsfcr = 0;
	info->pmsevfr = 0xFFFFFFFFFFFFFFFFUL;
	info->pmslatfr = 0;
	info->pmsirr =
	    (arm_spe_min_interval(info->sc) << PMSIRR_INTERVAL_SHIFT)
	    | PMSIRR_RND;
	info->pmsicr = 0;
	info->pmscr = PMSCR_TS | PMSCR_PA | PMSCR_CX | PMSCR_E1SPE | PMSCR_E0SPE;

	if (ctx->config != NULL &&
	    ctx->config_size == sizeof(struct arm_spe_config) &&
	    ctx->config_version == 1) {
		cfg = (struct arm_spe_config *)ctx->config;
		if (cfg->interval)
			arm_spe_set_interval(info, cfg->interval);
		if (cfg->level == ARM_SPE_KERNEL_ONLY)
			info->pmscr &= ~(PMSCR_E0SPE); /* turn off user */
		if (cfg->level == ARM_SPE_USER_ONLY)
			info->pmscr &= ~(PMSCR_E1SPE); /* turn off kern */
		if (cfg->ctx_field)
			info->ctx_field = cfg->ctx_field;
	} else
		err = EINVAL;
	mtx_unlock_spin(&info->lock);

	return (err);
}

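/*
 * Enable SPE on the current CPU. Called via smp_rendezvous_cpus() on each CPU
 * being traced: program the filtering and interval registers, point the
 * profiling buffer at the first half of this CPU's buffer and start sampling.
 */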
static void
arm_spe_enable(void *arg __unused)
{
	struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];
	uint64_t base, limit;

	dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

	mtx_lock_spin(&info->lock);

	if (info->ctx_field == ARM_SPE_CTX_CPU_ID)
		WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, PCPU_GET(cpuid));

	WRITE_SPECIALREG(PMSFCR_EL1_REG, info->pmsfcr);
	WRITE_SPECIALREG(PMSEVFR_EL1_REG, info->pmsevfr);
	WRITE_SPECIALREG(PMSLATFR_EL1_REG, info->pmslatfr);

	/* Set the sampling interval */
	WRITE_SPECIALREG(PMSIRR_EL1_REG, info->pmsirr);
	isb();

	/* Write 0 here before enabling sampling */
	WRITE_SPECIALREG(PMSICR_EL1_REG, info->pmsicr);
	isb();

	base = info->kvaddr;
	limit = base + (info->buf_size/2);
	/* Enable the buffer */
	limit &= PMBLIMITR_LIMIT_MASK; /* Zero lower 12 bits */
	limit |= PMBLIMITR_E;
	/* Set the base and limit */
	WRITE_SPECIALREG(PMBPTR_EL1_REG, base);
	WRITE_SPECIALREG(PMBLIMITR_EL1_REG, limit);
	isb();

	/* Enable sampling */
	WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
	isb();

	info->enabled = true;

	mtx_unlock_spin(&info->lock);
}

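/*
 * Record each traced CPU's buffer details, select whether the PID is written
 * to CONTEXTIDR_EL1, then enable SPE on every CPU in the map via an SMP
 * rendezvous.
 */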
static int
spe_backend_enable_smp(struct hwt_context *ctx)
{
	struct arm_spe_info *info;
	struct hwt_vm *vm;
	int cpu_id;

	HWT_CTX_LOCK(ctx);
	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		vm = hwt_cpu_get(ctx, cpu_id)->vm;

		info = &spe_info[cpu_id];

		mtx_lock_spin(&info->lock);
		info->kvaddr = vm->kvaddr;
		info->buf_size = ctx->bufsize;
		mtx_unlock_spin(&info->lock);
	}
	HWT_CTX_UNLOCK(ctx);

	cpu_id = CPU_FFS(&ctx->cpu_map) - 1;
	info = &spe_info[cpu_id];
	if (info->ctx_field == ARM_SPE_CTX_PID)
		arm64_pid_in_contextidr = true;
	else
		arm64_pid_in_contextidr = false;

	smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
	    arm_spe_enable, smp_no_rendezvous_barrier, NULL);

	return (0);
}

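/*
 * Disable SPE on the current CPU: stop sampling, drain any buffered trace
 * data, disable the profiling buffer and record the final write pointer for
 * the half-buffer currently in use.
 */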
void
arm_spe_disable(void *arg __unused)
{
	struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];
	struct arm_spe_buf_info *buf = &info->buf_info[info->buf_idx];

	if (!info->enabled)
		return;

	dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

	/* Disable profiling */
	WRITE_SPECIALREG(PMSCR_EL1_REG, 0x0);
	isb();

	/* Drain any remaining tracing data */
	psb_csync();
	dsb(nsh);

	/* Disable the profiling buffer */
	WRITE_SPECIALREG(PMBLIMITR_EL1_REG, 0);
	isb();

	/* Clear interrupt status reg */
	WRITE_SPECIALREG(PMBSR_EL1_REG, 0x0);

	/* Clear PID/CPU_ID from context ID reg */
	WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, 0);

	mtx_lock_spin(&info->lock);
	buf->pmbptr = READ_SPECIALREG(PMBPTR_EL1_REG);
	info->enabled = false;
	mtx_unlock_spin(&info->lock);
}

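/*
 * Disable tracing on every CPU in the context, queue the partially filled
 * buffers for userspace to copy out and send an ARM_SPE_KQ_SHUTDOWN kevent so
 * userspace knows it can finish up.
 */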
int
spe_backend_disable_smp(struct hwt_context *ctx)
{
	struct kevent kev;
	struct arm_spe_info *info;
	struct arm_spe_buf_info *buf;
	int cpu_id;
	int ret;

	/* Disable and send out remaining data in bufs */
	smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
	    arm_spe_disable, smp_no_rendezvous_barrier, NULL);

	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		info = &spe_info[cpu_id];
		buf = &info->buf_info[info->buf_idx];
		arm_spe_send_buffer(buf, 0);
	}

	arm64_pid_in_contextidr = false;

	/*
	 * Tracing on all CPUs has been disabled, and we've sent write ptr
	 * offsets for all bufs - let userspace know it can shutdown
	 */
	EV_SET(&kev, ARM_SPE_KQ_SHUTDOWN, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
	ret = kqfd_register(ctx->kqueue_fd, &kev, ctx->hwt_td, M_WAITOK);
	if (ret)
		dprintf("%s kqfd_register ret:%d\n", __func__, ret);

	return (0);
}

static void
spe_backend_stop(struct hwt_context *ctx)
{
	spe_backend_disable_smp(ctx);
}

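/*
 * Re-enable sampling on the current CPU after a previously full buffer has
 * been serviced by userspace.
 */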
static void
arm_spe_reenable(void *arg __unused)
{
	struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];

	WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
	isb();
}

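/*
 * Handle the HWT_IOC_SVC_BUF ioctl: userspace has finished copying out a
 * half-buffer. Clear the buffer's service flag and, if profiling was paused
 * waiting on this buffer, re-enable it on the owning CPU.
 */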
static int
spe_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size,
    int data_version)
{
	struct arm_spe_info *info;
	struct arm_spe_buf_info *buf;
	struct arm_spe_svc_buf *s;
	int err = 0;
	cpuset_t cpu_set;

	if (data_size != sizeof(struct arm_spe_svc_buf))
		return (E2BIG);

	if (data_version != 1)
		return (EINVAL);

	s = (struct arm_spe_svc_buf *)data;
	if (s->buf_idx > 1)
		return (ENODEV);
	if (s->ident >= mp_ncpus)
		return (EINVAL);

	info = &spe_info[s->ident];
	mtx_lock_spin(&info->lock);

	buf = &info->buf_info[s->buf_idx];

	if (!info->enabled) {
		err = ENXIO;
		goto end;
	}

	/* Clear the flag that signals the buffer needs servicing */
	buf->buf_svc = false;

	/* Re-enable profiling if we've been waiting for this notification */
	if (buf->buf_wait) {
		CPU_SETOF(s->ident, &cpu_set);

		mtx_unlock_spin(&info->lock);
		smp_rendezvous_cpus(cpu_set, smp_no_rendezvous_barrier,
		    arm_spe_reenable, smp_no_rendezvous_barrier, NULL);
		mtx_lock_spin(&info->lock);

		buf->buf_wait = false;
	}

end:
	mtx_unlock_spin(&info->lock);
	return (err);
}

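/*
 * Handle the HWT_IOC_BUFPTR_GET ioctl: pop the first entry from the pending
 * queue and return its identity, buffer offset and flags (buffer index,
 * partial record, final buffer) packed into *data.
 */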
static int
spe_backend_read(struct hwt_vm *vm, int *ident, vm_offset_t *offset,
    uint64_t *data)
{
	struct arm_spe_queue *q;
	struct arm_spe_softc *sc = device_get_softc(spe_dev);
	int error = 0;

	mtx_lock_spin(&sc->sc_lock);

	/* Return the first pending buffer that needs servicing */
	q = STAILQ_FIRST(&sc->pending);
	if (q == NULL) {
		error = ENOENT;
		goto error;
	}
	*ident = q->ident;
	*offset = q->offset;
	*data = (q->buf_idx << KQ_BUF_POS_SHIFT) |
	    (q->partial_rec << KQ_PARTREC_SHIFT) |
	    (q->final_buf << KQ_FINAL_BUF_SHIFT);

	STAILQ_REMOVE_HEAD(&sc->pending, next);
	sc->npending--;

error:
	mtx_unlock_spin(&sc->sc_lock);
	if (error)
		return (error);

	free(q, M_ARM_SPE);
	return (0);
}

static struct hwt_backend_ops spe_ops = {
	.hwt_backend_init = spe_backend_init,
	.hwt_backend_deinit = spe_backend_deinit,

	.hwt_backend_configure = spe_backend_configure,
	.hwt_backend_svc_buf = spe_backend_svc_buf,
	.hwt_backend_stop = spe_backend_stop,

	.hwt_backend_enable_smp = spe_backend_enable_smp,
	.hwt_backend_disable_smp = spe_backend_disable_smp,

	.hwt_backend_read = spe_backend_read,
};

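/*
 * Called by the SPE device driver to register this backend with the HWT
 * framework.
 */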
int
spe_register(device_t dev)
{
	spe_dev = dev;

	return (hwt_backend_register(&backend));
}