/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Arm Ltd
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * Portions of this software were developed by Andrew Turner under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Arm Statistical Profiling Extension (SPE) backend
 *
 * Basic SPE operation
 *
 * SPE is enabled and configured on a per-core basis, with each core requiring
 * separate code to enable and configure. Each core also requires a separate
 * buffer passed as config where the CPU will write profiling data. When the
 * profiling buffer is full, an interrupt will be taken on the same CPU.
 *
 * Driver Design
 *
 * - HWT allocates a single large buffer per core. This buffer is split in half
 *   to create a 2 element circular buffer (aka ping-pong buffer) where the
 *   kernel writes to one half while userspace is copying the other half
 * - SMP calls are used to enable and configure each core, with SPE initially
 *   configured to write to the first half of the buffer
 * - When the first half of the buffer is full, a buffer full interrupt will
 *   immediately switch writing to the second half. The kernel adds the details
 *   of the half that needs copying to a FIFO STAILQ and notifies userspace via
 *   kqueue by sending an ARM_SPE_KQ_BUF kevent with how many buffers on the
 *   queue need servicing
 * - The kernel responds to the HWT_IOC_BUFPTR_GET ioctl by sending details of
 *   the first item from the queue
 * - The buffers pending copying will not be overwritten until an
 *   HWT_IOC_SVC_BUF ioctl is received from userspace confirming the data has
 *   been copied out
 * - In the case where both halves of the buffer are full, profiling will be
 *   paused until notification via HWT_IOC_SVC_BUF is received
 *
 * Future improvements and limitations
 *
 * - Using large buffer sizes should minimise pauses and loss of profiling
 *   data while the kernel is waiting for userspace to copy out data. Since it
 *   is generally expected that consuming (copying) this data is faster than
 *   producing it, in practice this has not so far been an issue. If it does
 *   prove to be an issue even with large buffer sizes, then additional
 *   buffering, i.e. n-element circular buffers, might be required.
 *
 * - kqueue can only notify and queue one kevent of the same type, with
 *   subsequent events overwriting data in the first event. The ARM_SPE_KQ_BUF
 *   kevent can therefore only carry the number of buffers on the STAILQ,
 *   incremented each time a new buffer is full. In this case kqueue serves
 *   just as a notification to userspace to wake up and query the kernel with
 *   the appropriate ioctl. An alternative might be custom kevents where the
 *   kevent identifier is encoded with something like n+cpu_id or n+tid. In
 *   this case data could be sent directly with kqueue via the kevent data and
 *   fflags elements, avoiding the extra ioctl.
 */
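
/*
 * Illustrative sketch of the expected userspace flow (ioctl arguments are
 * elided here; only the event and ioctl names above are assumed):
 *
 *	kevent(kq, ...);			wait for ARM_SPE_KQ_BUF
 *	for (i = 0; i < kev.data; i++) {	kev.data = buffers pending
 *		ioctl(fd, HWT_IOC_BUFPTR_GET, ...);	which CPU/half + offset
 *		copy that half of the mmap'd buffer out
 *		ioctl(fd, HWT_IOC_SVC_BUF, ...);	the half may be reused
 *	}
 *
 * An ARM_SPE_KQ_SHUTDOWN kevent indicates tracing has stopped and the final
 * buffer offsets have been queued, so userspace can drain and exit.
 */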

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <machine/bus.h>

#include <arm64/spe/arm_spe_dev.h>

#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_thread.h>

MALLOC_DECLARE(M_ARM_SPE);

extern u_int mp_maxid;
extern struct taskqueue *taskqueue_arm_spe;

int spe_backend_disable_smp(struct hwt_context *ctx);

static device_t spe_dev;
static struct hwt_backend_ops spe_ops;
static struct hwt_backend backend = {
        .ops = &spe_ops,
        .name = "spe",
        .kva_req = 1,
};

static struct arm_spe_info *spe_info;

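/*
 * Thread (per-process) tracing is not currently supported by this backend;
 * only CPU mode is implemented.
 */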
static int
spe_backend_init_thread(struct hwt_context *ctx)
{
        return (ENOTSUP);
}

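/*
 * Allocate and initialise the per-CPU arm_spe_info array: each CPU in the
 * context's cpu_map gets a pair of buffer descriptors (one per ping-pong
 * half) and a spin lock protecting its state.
 */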
static void
spe_backend_init_cpu(struct hwt_context *ctx)
{
        struct arm_spe_info *info;
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        char lock_name[32];
        char *tmp = "Arm SPE lock/cpu/";
        int cpu_id;

        spe_info = malloc(sizeof(struct arm_spe_info) * mp_ncpus,
            M_ARM_SPE, M_WAITOK | M_ZERO);

        sc->spe_info = spe_info;

        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                info = &spe_info[cpu_id];
                info->sc = sc;
                info->ident = cpu_id;
                info->buf_info[0].info = info;
                info->buf_info[0].buf_idx = 0;
                info->buf_info[1].info = info;
                info->buf_info[1].buf_idx = 1;
                snprintf(lock_name, sizeof(lock_name), "%s%d", tmp, cpu_id);
                mtx_init(&info->lock, lock_name, NULL, MTX_SPIN);
        }
}

static int
spe_backend_init(struct hwt_context *ctx)
{
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        int error = 0;

        /*
         * HWT currently requires the buffer size to be a multiple of
         * PAGE_SIZE (i.e. a minimum of 4KB), while the maximum PMBIDR.Align
         * is 2KB. This should never fail, but it's a good sanity check.
         */
        if (ctx->bufsize % sc->kva_align != 0)
                return (EINVAL);

        /*
         * Since we're splitting the buffer in half, and PMBLIMITR needs to be
         * page aligned, the minimum buffer size is 2x PAGE_SIZE.
         */
        if (ctx->bufsize < (2 * PAGE_SIZE))
                return (EINVAL);

        sc->ctx = ctx;
        sc->kqueue_fd = ctx->kqueue_fd;
        sc->hwt_td = ctx->hwt_td;

        if (ctx->mode == HWT_MODE_THREAD)
                error = spe_backend_init_thread(ctx);
        else
                spe_backend_init_cpu(ctx);

        return (error);
}

#ifdef ARM_SPE_DEBUG
static void
hex_dump(uint8_t *buf, size_t len)
{
        size_t i;

        printf("--------------------------------------------------------------\n");
        for (i = 0; i < len; ++i) {
                if (i % 8 == 0) {
                        printf(" ");
                }
                if (i % 16 == 0) {
                        if (i != 0) {
                                printf("\r\n");
                        }
                        printf("\t");
                }
                printf("%02X ", buf[i]);
        }
        printf("\r\n");
}
#endif

static int
spe_backend_deinit(struct hwt_context *ctx)
{
#ifdef ARM_SPE_DEBUG
        struct arm_spe_info *info;
        int cpu_id;

        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                info = &spe_info[cpu_id];
                hex_dump((void *)info->kvaddr, 128);
                hex_dump((void *)(info->kvaddr + (info->buf_size / 2)), 128);
        }
#endif

        if (ctx->state == CTX_STATE_RUNNING) {
                spe_backend_disable_smp(ctx);
                ctx->state = CTX_STATE_STOPPED;
        }

        free(spe_info, M_ARM_SPE);

        return (0);
}

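/*
 * PMSIDR_EL1.Interval advertises the minimum sampling interval supported by
 * this implementation; decode it into a cycle count.
 */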
static uint64_t
arm_spe_min_interval(struct arm_spe_softc *sc)
{
        /* IMPLEMENTATION DEFINED */
        switch (PMSIDR_Interval_VAL(sc->pmsidr)) {
        case PMSIDR_Interval_256:
                return (256);
        case PMSIDR_Interval_512:
                return (512);
        case PMSIDR_Interval_768:
                return (768);
        case PMSIDR_Interval_1024:
                return (1024);
        case PMSIDR_Interval_1536:
                return (1536);
        case PMSIDR_Interval_2048:
                return (2048);
        case PMSIDR_Interval_3072:
                return (3072);
        case PMSIDR_Interval_4096:
                return (4096);
        default:
                return (4096);
        }
}

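/*
 * Clamp the requested interval to the range supported by the hardware and
 * program it into the cached PMSIRR.INTERVAL field.
 */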
static inline void
arm_spe_set_interval(struct arm_spe_info *info, uint64_t interval)
{
        uint64_t min_interval = arm_spe_min_interval(info->sc);

        interval = MAX(interval, min_interval);
        interval = MIN(interval, (1 << 24) - 1);	/* 24 bit field */

        dprintf("%s %lu\n", __func__, interval);

        info->pmsirr &= ~(PMSIRR_INTERVAL_MASK);
        info->pmsirr |= (interval << PMSIRR_INTERVAL_SHIFT);
}

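/*
 * Apply the per-session configuration to this CPU's cached register values.
 * Userspace supplies a version 1 struct arm_spe_config:
 *   - interval: sampling interval (0 = keep the default minimum interval)
 *   - level: ARM_SPE_KERNEL_ONLY or ARM_SPE_USER_ONLY (default is both)
 *   - ctx_field: ARM_SPE_CTX_CPU_ID or ARM_SPE_CTX_PID, selecting what is
 *     written to CONTEXTIDR_EL1 while profiling
 */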
static int
spe_backend_configure(struct hwt_context *ctx, int cpu_id, int session_id)
{
        struct arm_spe_info *info = &spe_info[cpu_id];
        struct arm_spe_config *cfg;
        int err = 0;

        mtx_lock_spin(&info->lock);
        info->ident = cpu_id;
        /* Set defaults */
        info->pmsfcr = 0;
        info->pmsevfr = 0xFFFFFFFFFFFFFFFFUL;
        info->pmslatfr = 0;
        info->pmsirr =
            (arm_spe_min_interval(info->sc) << PMSIRR_INTERVAL_SHIFT)
            | PMSIRR_RND;
        info->pmsicr = 0;
        info->pmscr = PMSCR_TS | PMSCR_PA | PMSCR_CX | PMSCR_E1SPE |
            PMSCR_E0SPE;

        if (ctx->config != NULL &&
            ctx->config_size == sizeof(struct arm_spe_config) &&
            ctx->config_version == 1) {
                cfg = (struct arm_spe_config *)ctx->config;
                if (cfg->interval)
                        arm_spe_set_interval(info, cfg->interval);
                if (cfg->level == ARM_SPE_KERNEL_ONLY)
                        info->pmscr &= ~(PMSCR_E0SPE);	/* turn off user */
                if (cfg->level == ARM_SPE_USER_ONLY)
                        info->pmscr &= ~(PMSCR_E1SPE);	/* turn off kern */
                if (cfg->ctx_field)
                        info->ctx_field = cfg->ctx_field;
        } else
                err = EINVAL;
        mtx_unlock_spin(&info->lock);

        return (err);
}

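/*
 * Program and start SPE on the current CPU (called via SMP rendezvous):
 * write the cached filter and interval values, reset the sampling counter,
 * point PMBPTR/PMBLIMITR at the first half of this CPU's buffer and enable
 * sampling via PMSCR.
 */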
static void
arm_spe_enable(void *arg __unused)
{
        struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];
        uint64_t base, limit;

        dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

        mtx_lock_spin(&info->lock);

        if (info->ctx_field == ARM_SPE_CTX_CPU_ID)
                WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, PCPU_GET(cpuid));

        WRITE_SPECIALREG(PMSFCR_EL1_REG, info->pmsfcr);
        WRITE_SPECIALREG(PMSEVFR_EL1_REG, info->pmsevfr);
        WRITE_SPECIALREG(PMSLATFR_EL1_REG, info->pmslatfr);

        /* Set the sampling interval */
        WRITE_SPECIALREG(PMSIRR_EL1_REG, info->pmsirr);
        isb();

        /* Write 0 here before enabling sampling */
        WRITE_SPECIALREG(PMSICR_EL1_REG, info->pmsicr);
        isb();

        base = info->kvaddr;
        limit = base + (info->buf_size / 2);
        /* Enable the buffer */
        limit &= PMBLIMITR_LIMIT_MASK;	/* Zero lower 12 bits */
        limit |= PMBLIMITR_E;
        /* Set the base and limit */
        WRITE_SPECIALREG(PMBPTR_EL1_REG, base);
        WRITE_SPECIALREG(PMBLIMITR_EL1_REG, limit);
        isb();

        /* Enable sampling */
        WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
        isb();

        info->enabled = true;

        mtx_unlock_spin(&info->lock);
}

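/*
 * Record each CPU's buffer address and size from HWT, then enable SPE on all
 * CPUs in the context's cpu_map via SMP rendezvous.
 */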
static int
spe_backend_enable_smp(struct hwt_context *ctx)
{
        struct arm_spe_info *info;
        struct hwt_vm *vm;
        int cpu_id;

        HWT_CTX_LOCK(ctx);
        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                vm = hwt_cpu_get(ctx, cpu_id)->vm;

                info = &spe_info[cpu_id];

                mtx_lock_spin(&info->lock);
                info->kvaddr = vm->kvaddr;
                info->buf_size = ctx->bufsize;
                mtx_unlock_spin(&info->lock);
        }
        HWT_CTX_UNLOCK(ctx);

        cpu_id = CPU_FFS(&ctx->cpu_map) - 1;
        info = &spe_info[cpu_id];
        if (info->ctx_field == ARM_SPE_CTX_PID)
                arm64_pid_in_contextidr = true;
        else
                arm64_pid_in_contextidr = false;

        smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
            arm_spe_enable, smp_no_rendezvous_barrier, NULL);

        return (0);
}

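/*
 * Stop SPE on the current CPU: disable sampling, drain any buffered trace
 * data, disable and acknowledge the profiling buffer, then record how far
 * the write pointer got so the remaining data can be handed to userspace.
 */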
void
arm_spe_disable(void *arg __unused)
{
        struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];
        struct arm_spe_buf_info *buf = &info->buf_info[info->buf_idx];

        if (!info->enabled)
                return;

        dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

        /* Disable profiling */
        WRITE_SPECIALREG(PMSCR_EL1_REG, 0x0);
        isb();

        /* Drain any remaining tracing data */
        psb_csync();
        dsb(nsh);

        /* Disable the profiling buffer */
        WRITE_SPECIALREG(PMBLIMITR_EL1_REG, 0);
        isb();

        /* Clear interrupt status reg */
        WRITE_SPECIALREG(PMBSR_EL1_REG, 0x0);

        /* Clear PID/CPU_ID from context ID reg */
        WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, 0);

        mtx_lock_spin(&info->lock);
        buf->pmbptr = READ_SPECIALREG(PMBPTR_EL1_REG);
        info->enabled = false;
        mtx_unlock_spin(&info->lock);
}

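/*
 * Disable tracing on every CPU in the context, queue the final (possibly
 * partial) buffer halves for userspace to copy out, and send an
 * ARM_SPE_KQ_SHUTDOWN kevent so userspace knows it can shut down.
 */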
int
spe_backend_disable_smp(struct hwt_context *ctx)
{
        struct kevent kev;
        struct arm_spe_info *info;
        struct arm_spe_buf_info *buf;
        int cpu_id;
        int ret;

        /* Disable and send out remaining data in bufs */
        smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
            arm_spe_disable, smp_no_rendezvous_barrier, NULL);

        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                info = &spe_info[cpu_id];
                buf = &info->buf_info[info->buf_idx];
                arm_spe_send_buffer(buf, 0);
        }

        arm64_pid_in_contextidr = false;

        /*
         * Tracing on all CPUs has been disabled, and we've sent write ptr
         * offsets for all bufs - let userspace know it can shutdown
         */
        EV_SET(&kev, ARM_SPE_KQ_SHUTDOWN, EVFILT_USER, 0, NOTE_TRIGGER, 0,
            NULL);
        ret = kqfd_register(ctx->kqueue_fd, &kev, ctx->hwt_td, M_WAITOK);
        if (ret)
                dprintf("%s kqfd_register ret:%d\n", __func__, ret);

        return (0);
}

static void
spe_backend_stop(struct hwt_context *ctx)
{
        spe_backend_disable_smp(ctx);
}

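/*
 * Restart sampling on the current CPU after userspace has freed up a buffer
 * half (called via SMP rendezvous from spe_backend_svc_buf()).
 */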
static void
arm_spe_reenable(void *arg __unused)
{
        struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];

        WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
        isb();
}

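/*
 * Handle the HWT_IOC_SVC_BUF ioctl: userspace has finished copying out a
 * buffer half, so mark it free and, if profiling was paused waiting for it,
 * re-enable sampling on that CPU.
 */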
static int
spe_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size,
    int data_version)
{
        struct arm_spe_info *info;
        struct arm_spe_buf_info *buf;
        struct arm_spe_svc_buf *s;
        int err = 0;
        cpuset_t cpu_set;

        if (data_size != sizeof(struct arm_spe_svc_buf))
                return (E2BIG);

        if (data_version != 1)
                return (EINVAL);

        s = (struct arm_spe_svc_buf *)data;
        if (s->buf_idx > 1)
                return (ENODEV);
        if (s->ident >= mp_ncpus)
                return (EINVAL);

        info = &spe_info[s->ident];
        mtx_lock_spin(&info->lock);

        buf = &info->buf_info[s->buf_idx];

        if (!info->enabled) {
                err = ENXIO;
                goto end;
        }

        /* Clear the flag that signals the buffer needs servicing */
        buf->buf_svc = false;

        /* Re-enable profiling if we've been waiting for this notification */
        if (buf->buf_wait) {
                CPU_SETOF(s->ident, &cpu_set);

                mtx_unlock_spin(&info->lock);
                smp_rendezvous_cpus(cpu_set, smp_no_rendezvous_barrier,
                    arm_spe_reenable, smp_no_rendezvous_barrier, NULL);
                mtx_lock_spin(&info->lock);

                buf->buf_wait = false;
        }

end:
        mtx_unlock_spin(&info->lock);
        return (err);
}

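/*
 * Handle the HWT_IOC_BUFPTR_GET ioctl: pop the oldest entry from the pending
 * queue and return which CPU needs copying, the current write offset, and
 * the buffer index / partial-record / final-buffer flags packed into *data.
 */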
static int
spe_backend_read(struct hwt_vm *vm, int *ident, vm_offset_t *offset,
    uint64_t *data)
{
        struct arm_spe_queue *q;
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        int error = 0;

        mtx_lock_spin(&sc->sc_lock);

        /* Return the first pending buffer that needs servicing */
        q = STAILQ_FIRST(&sc->pending);
        if (q == NULL) {
                error = ENOENT;
                goto error;
        }
        *ident = q->ident;
        *offset = q->offset;
        *data = (q->buf_idx << KQ_BUF_POS_SHIFT) |
            (q->partial_rec << KQ_PARTREC_SHIFT) |
            (q->final_buf << KQ_FINAL_BUF_SHIFT);

        STAILQ_REMOVE_HEAD(&sc->pending, next);
        sc->npending--;

error:
        mtx_unlock_spin(&sc->sc_lock);
        if (error)
                return (error);

        free(q, M_ARM_SPE);
        return (0);
}

static struct hwt_backend_ops spe_ops = {
        .hwt_backend_init = spe_backend_init,
        .hwt_backend_deinit = spe_backend_deinit,

        .hwt_backend_configure = spe_backend_configure,
        .hwt_backend_svc_buf = spe_backend_svc_buf,
        .hwt_backend_stop = spe_backend_stop,

        .hwt_backend_enable_smp = spe_backend_enable_smp,
        .hwt_backend_disable_smp = spe_backend_disable_smp,

        .hwt_backend_read = spe_backend_read,
};

int
spe_register(device_t dev)
{
        spe_dev = dev;

        return (hwt_backend_register(&backend));
}