/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Arm Ltd
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * Portions of this software were developed by Andrew Turner under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Arm Statistical Profiling Extension (SPE) backend
 *
 * Basic SPE operation
 *
 * SPE is enabled and configured on a per-core basis, with each core requiring
 * its own enable and configure calls. Each core also requires its own buffer,
 * passed in via config, where the CPU will write profiling data. When the
 * profiling buffer is full, an interrupt is taken on the same CPU.
 *
 * Driver Design
 *
 * - HWT allocates a single large buffer per core. This buffer is split in half
 *   to create a 2 element circular buffer (aka ping-pong buffer) where the
 *   kernel writes to one half while userspace is copying the other half.
 * - SMP calls are used to enable and configure each core, with SPE initially
 *   configured to write to the first half of the buffer.
 * - When the first half of the buffer is full, a buffer full interrupt
 *   immediately switches writing to the second half. The kernel adds the
 *   details of the half that needs copying to a FIFO STAILQ and notifies
 *   userspace via kqueue by sending an ARM_SPE_KQ_BUF kevent indicating how
 *   many buffers on the queue need servicing.
 * - The kernel responds to the HWT_IOC_BUFPTR_GET ioctl with the details of
 *   the first item on the queue.
 * - Buffers pending copying will not be overwritten until an HWT_IOC_SVC_BUF
 *   ioctl is received from userspace confirming the data has been copied out.
 * - In the case where both halves of the buffer are full, profiling is paused
 *   until notification via HWT_IOC_SVC_BUF is received.
 *
 * Future improvements and limitations
 *
 * - Using large buffer sizes should minimise pauses and loss of profiling
 *   data while the kernel is waiting for userspace to copy out data. Since it
 *   is generally expected that consuming (copying) this data is faster than
 *   producing it, in practice this has not so far been an issue. If it does
 *   prove to be an issue even with large buffer sizes, then additional
 *   buffering, i.e. n-element circular buffers, might be required.
 *
 * - kqueue can only notify and queue one kevent of the same type, with
 *   subsequent events overwriting data in the first event. The kevent
 *   ARM_SPE_KQ_BUF can therefore only contain the number of buffers on the
 *   STAILQ, incrementing each time a new buffer is full. In this case kqueue
 *   serves just as a notification to userspace to wake up and query the kernel
 *   with the appropriate ioctl; a minimal sketch of that userspace loop
 *   follows this comment. An alternative might be custom kevents where the
 *   kevent identifier is encoded with something like n+cpu_id or n+tid. In
 *   this case data could be sent directly with kqueue via the kevent data and
 *   fflags elements, avoiding the extra ioctl.
 */
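
/*
 * For orientation, a rough sketch of the userspace side of the protocol
 * described above. This is illustrative only: the argument passed with
 * HWT_IOC_BUFPTR_GET, the fields marked with <>, and the error handling are
 * assumptions, not definitions taken from the hwt(4) API.
 *
 *	struct kevent ev;
 *	struct arm_spe_svc_buf svc;
 *
 *	for (;;) {
 *		kevent(kq_fd, NULL, 0, &ev, 1, NULL);
 *		if (ev.ident == ARM_SPE_KQ_SHUTDOWN)
 *			break;			// tracing has stopped
 *		if (ev.ident != ARM_SPE_KQ_BUF)
 *			continue;
 *		// Drain the queue of full buffer halves; the kevent only
 *		// says how many halves are pending.
 *		while (ioctl(hwt_fd, HWT_IOC_BUFPTR_GET, &bufptr) == 0) {
 *			// ... copy out the half described by bufptr ...
 *			svc.ident = <cpu id reported by the kernel>;
 *			svc.buf_idx = <half index, 0 or 1>;
 *			ioctl(hwt_fd, HWT_IOC_SVC_BUF, &svc);
 *		}
 *	}
 */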

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <machine/bus.h>

#include <arm64/spe/arm_spe_dev.h>

#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_thread.h>

MALLOC_DECLARE(M_ARM_SPE);

extern u_int mp_maxid;
extern struct taskqueue *taskqueue_arm_spe;

int spe_backend_disable_smp(struct hwt_context *ctx);

static device_t spe_dev;
static struct hwt_backend_ops spe_ops;
static struct hwt_backend backend = {
	.ops = &spe_ops,
	.name = "spe",
	.kva_req = 1,
};

static struct arm_spe_info *spe_info;

static int
spe_backend_init_thread(struct hwt_context *ctx)
{
	return (ENOTSUP);
}

static void
spe_backend_init_cpu(struct hwt_context *ctx)
{
	struct arm_spe_info *info;
	struct arm_spe_softc *sc = device_get_softc(spe_dev);
	char lock_name[32];
	char *tmp = "Arm SPE lock/cpu/";
	int cpu_id;

	spe_info = malloc(sizeof(struct arm_spe_info) * mp_ncpus,
	    M_ARM_SPE, M_WAITOK | M_ZERO);

	sc->spe_info = spe_info;

	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		info = &spe_info[cpu_id];
		info->sc = sc;
		info->ident = cpu_id;
		info->buf_info[0].info = info;
		info->buf_info[0].buf_idx = 0;
		info->buf_info[1].info = info;
		info->buf_info[1].buf_idx = 1;
		snprintf(lock_name, sizeof(lock_name), "%s%d", tmp, cpu_id);
		mtx_init(&info->lock, lock_name, NULL, MTX_SPIN);
	}
}

static int
spe_backend_init(struct hwt_context *ctx)
{
	struct arm_spe_softc *sc = device_get_softc(spe_dev);
	int error = 0;

	/*
	 * HWT currently specifies that the buffer size must be a multiple of
	 * PAGE_SIZE, i.e. a minimum of 4KB, while the maximum PMBIDR.Align is
	 * 2KB. A misaligned buffer should therefore never happen, but it's
	 * good to sense check.
	 */
	if (ctx->bufsize % sc->kva_align != 0)
		return (EINVAL);

	/*
	 * Since we're splitting the buffer in half and PMBLIMITR needs to be
	 * page aligned, the minimum buffer size is 2x PAGE_SIZE.
	 */
	if (ctx->bufsize < (2 * PAGE_SIZE))
		return (EINVAL);

	sc->ctx = ctx;
	sc->kqueue_fd = ctx->kqueue_fd;
	sc->hwt_td = ctx->hwt_td;

	if (ctx->mode == HWT_MODE_THREAD)
		error = spe_backend_init_thread(ctx);
	else
		spe_backend_init_cpu(ctx);

	return (error);
}

#ifdef ARM_SPE_DEBUG
static void hex_dump(uint8_t *buf, size_t len)
{
	size_t i;

	printf("--------------------------------------------------------------\n");
	for (i = 0; i < len; ++i) {
		if (i % 8 == 0) {
			printf(" ");
		}
		if (i % 16 == 0) {
			if (i != 0) {
				printf("\r\n");
			}
			printf("\t");
		}
		printf("%02X ", buf[i]);
	}
	printf("\r\n");
}
#endif

static int
spe_backend_deinit(struct hwt_context *ctx)
{
#ifdef ARM_SPE_DEBUG
	struct arm_spe_info *info;
	int cpu_id;

	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		info = &spe_info[cpu_id];
		hex_dump((void *)info->kvaddr, 128);
		hex_dump((void *)(info->kvaddr + (info->buf_size/2)), 128);
	}
#endif

	if (ctx->state == CTX_STATE_RUNNING) {
		spe_backend_disable_smp(ctx);
		ctx->state = CTX_STATE_STOPPED;
	}

	free(spe_info, M_ARM_SPE);

	return (0);
}

static uint64_t
arm_spe_min_interval(struct arm_spe_softc *sc)
{
	/* IMPLEMENTATION DEFINED */
	switch (PMSIDR_Interval_VAL(sc->pmsidr))
	{
	case PMSIDR_Interval_256:
		return (256);
	case PMSIDR_Interval_512:
		return (512);
	case PMSIDR_Interval_768:
		return (768);
	case PMSIDR_Interval_1024:
		return (1024);
	case PMSIDR_Interval_1536:
		return (1536);
	case PMSIDR_Interval_2048:
		return (2048);
	case PMSIDR_Interval_3072:
		return (3072);
	case PMSIDR_Interval_4096:
		return (4096);
	default:
		return (4096);
	}
}

static inline void
arm_spe_set_interval(struct arm_spe_info *info, uint64_t interval)
{
	uint64_t min_interval = arm_spe_min_interval(info->sc);

	interval = MAX(interval, min_interval);
	interval = MIN(interval, 1 << 24); /* max 24 bits */

	dprintf("%s %lu\n", __func__, interval);

	info->pmsirr &= ~(PMSIRR_INTERVAL_MASK);
	info->pmsirr |= (interval << PMSIRR_INTERVAL_SHIFT);
}
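
/*
 * Example of the optional per-session configuration userspace can supply via
 * ctx->config (config_version 1), as consumed by spe_backend_configure()
 * below. The values are purely illustrative; the authoritative layout of
 * struct arm_spe_config is in the SPE headers.
 *
 *	struct arm_spe_config cfg = {
 *		.interval = 4096,		// sampling interval; clamped
 *						// between the PMSIDR minimum
 *						// and 2^24
 *		.level = ARM_SPE_KERNEL_ONLY,	// profile EL1 only
 *		.ctx_field = ARM_SPE_CTX_PID,	// record the PID in
 *						// CONTEXTIDR_EL1
 *	};
 */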

static int
spe_backend_configure(struct hwt_context *ctx, int cpu_id, int session_id)
{
	struct arm_spe_info *info = &spe_info[cpu_id];
	struct arm_spe_config *cfg;
	int err = 0;

	mtx_lock_spin(&info->lock);
	info->ident = cpu_id;
	/* Set defaults */
	info->pmsfcr = 0;
	info->pmsevfr = 0xFFFFFFFFFFFFFFFFUL;
	info->pmslatfr = 0;
	info->pmsirr =
	    (arm_spe_min_interval(info->sc) << PMSIRR_INTERVAL_SHIFT)
	    | PMSIRR_RND;
	info->pmsicr = 0;
	info->pmscr = PMSCR_TS | PMSCR_PA | PMSCR_CX | PMSCR_E1SPE | PMSCR_E0SPE;

	if (ctx->config != NULL &&
	    ctx->config_size == sizeof(struct arm_spe_config) &&
	    ctx->config_version == 1) {
		cfg = (struct arm_spe_config *)ctx->config;
		if (cfg->interval)
			arm_spe_set_interval(info, cfg->interval);
		if (cfg->level == ARM_SPE_KERNEL_ONLY)
			info->pmscr &= ~(PMSCR_E0SPE); /* turn off user */
		if (cfg->level == ARM_SPE_USER_ONLY)
			info->pmscr &= ~(PMSCR_E1SPE); /* turn off kern */
		if (cfg->ctx_field)
			info->ctx_field = cfg->ctx_field;
	} else
		err = (EINVAL);
	mtx_unlock_spin(&info->lock);

	return (err);
}

static void
arm_spe_enable(void *arg __unused)
{
	struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];
	uint64_t base, limit;

	dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

	mtx_lock_spin(&info->lock);

	if (info->ctx_field == ARM_SPE_CTX_CPU_ID)
		WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, PCPU_GET(cpuid));

	WRITE_SPECIALREG(PMSFCR_EL1_REG, info->pmsfcr);
	WRITE_SPECIALREG(PMSEVFR_EL1_REG, info->pmsevfr);
	WRITE_SPECIALREG(PMSLATFR_EL1_REG, info->pmslatfr);

	/* Set the sampling interval */
	WRITE_SPECIALREG(PMSIRR_EL1_REG, info->pmsirr);
	isb();

	/* Write 0 here before enabling sampling */
	WRITE_SPECIALREG(PMSICR_EL1_REG, info->pmsicr);
	isb();

	base = info->kvaddr;
	limit = base + (info->buf_size/2);
	/* Enable the buffer */
	limit &= PMBLIMITR_LIMIT_MASK; /* Zero lower 12 bits */
	limit |= PMBLIMITR_E;
	/* Set the base and limit */
	WRITE_SPECIALREG(PMBPTR_EL1_REG, base);
	WRITE_SPECIALREG(PMBLIMITR_EL1_REG, limit);
	isb();

	/* Enable sampling */
	WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
	isb();

	info->enabled = true;

	mtx_unlock_spin(&info->lock);
}
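
/*
 * For a single CPU the ping-pong scheme maps onto the profiling buffer
 * registers roughly as follows (kernel VAs). Only half 0 is programmed here;
 * the switch to half 1 happens in the buffer-full interrupt path outside this
 * file, so treat this as a sketch:
 *
 *	half 0: PMBPTR_EL1 = kvaddr			limit = kvaddr + buf_size/2
 *	half 1: PMBPTR_EL1 = kvaddr + buf_size/2	limit = kvaddr + buf_size
 *
 * arm_spe_enable() above always starts a session on half 0.
 */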

static int
spe_backend_enable_smp(struct hwt_context *ctx)
{
	struct arm_spe_info *info;
	struct hwt_vm *vm;
	int cpu_id;

	HWT_CTX_LOCK(ctx);
	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		vm = hwt_cpu_get(ctx, cpu_id)->vm;

		info = &spe_info[cpu_id];

		mtx_lock_spin(&info->lock);
		info->kvaddr = vm->kvaddr;
		info->buf_size = ctx->bufsize;
		mtx_unlock_spin(&info->lock);
	}
	HWT_CTX_UNLOCK(ctx);

	cpu_id = CPU_FFS(&ctx->cpu_map) - 1;
	info = &spe_info[cpu_id];
	if (info->ctx_field == ARM_SPE_CTX_PID)
		arm64_pid_in_contextidr = true;
	else
		arm64_pid_in_contextidr = false;

	smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
	    arm_spe_enable, smp_no_rendezvous_barrier, NULL);

	return (0);
}

void
arm_spe_disable(void *arg __unused)
{
	struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];
	struct arm_spe_buf_info *buf = &info->buf_info[info->buf_idx];

	if (!info->enabled)
		return;

	dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

	/* Disable profiling */
	WRITE_SPECIALREG(PMSCR_EL1_REG, 0x0);
	isb();

	/* Drain any remaining tracing data */
	psb_csync();
	dsb(nsh);

	/* Disable the profiling buffer */
	WRITE_SPECIALREG(PMBLIMITR_EL1_REG, 0);
	isb();

	/* Clear interrupt status reg */
	WRITE_SPECIALREG(PMBSR_EL1_REG, 0x0);

	/* Clear PID/CPU_ID from context ID reg */
	WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, 0);

	mtx_lock_spin(&info->lock);
	buf->pmbptr = READ_SPECIALREG(PMBPTR_EL1_REG);
	info->enabled = false;
	mtx_unlock_spin(&info->lock);
}

int
spe_backend_disable_smp(struct hwt_context *ctx)
{
	struct kevent kev;
	struct arm_spe_info *info;
	struct arm_spe_buf_info *buf;
	int cpu_id;
	int ret;

	/* Disable and send out remaining data in bufs */
	smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
	    arm_spe_disable, smp_no_rendezvous_barrier, NULL);

	CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
		info = &spe_info[cpu_id];
		buf = &info->buf_info[info->buf_idx];
		arm_spe_send_buffer(buf, 0);
	}

	arm64_pid_in_contextidr = false;

	/*
	 * Tracing on all CPUs has been disabled, and we've sent write ptr
	 * offsets for all bufs - let userspace know it can shut down.
	 */
	EV_SET(&kev, ARM_SPE_KQ_SHUTDOWN, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
	ret = kqfd_register(ctx->kqueue_fd, &kev, ctx->hwt_td, M_WAITOK);
	if (ret)
		dprintf("%s kqfd_register ret:%d\n", __func__, ret);

	return (0);
}

static void
spe_backend_stop(struct hwt_context *ctx)
{
	spe_backend_disable_smp(ctx);
}

static void
arm_spe_reenable(void *arg __unused)
{
	struct arm_spe_info *info = &spe_info[PCPU_GET(cpuid)];

	WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
	isb();
}

static int
spe_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size,
    int data_version)
{
	struct arm_spe_info *info;
	struct arm_spe_buf_info *buf;
	struct arm_spe_svc_buf *s;
	int err = 0;
	cpuset_t cpu_set;

	if (data_size != sizeof(struct arm_spe_svc_buf))
		return (E2BIG);

	if (data_version != 1)
		return (EINVAL);

	s = (struct arm_spe_svc_buf *)data;
	if (s->buf_idx > 1)
		return (ENODEV);
	if (s->ident >= mp_ncpus)
		return (EINVAL);

	info = &spe_info[s->ident];
	mtx_lock_spin(&info->lock);

	buf = &info->buf_info[s->buf_idx];

	if (!info->enabled) {
		err = ENXIO;
		goto end;
	}

	/* Clear the flag that signals the buffer needs servicing */
	buf->buf_svc = false;

	/* Re-enable profiling if we've been waiting for this notification */
	if (buf->buf_wait) {
		CPU_SETOF(s->ident, &cpu_set);

		mtx_unlock_spin(&info->lock);
		smp_rendezvous_cpus(cpu_set, smp_no_rendezvous_barrier,
		    arm_spe_reenable, smp_no_rendezvous_barrier, NULL);
		mtx_lock_spin(&info->lock);

		buf->buf_wait = false;
	}

end:
	mtx_unlock_spin(&info->lock);
	return (err);
}
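
/*
 * spe_backend_read() below packs per-buffer metadata into the data word
 * returned for HWT_IOC_BUFPTR_GET. A consumer would unpack it with the same
 * shift constants, e.g. (the single-bit masks here are an assumption; the
 * KQ_* definitions are authoritative):
 *
 *	buf_idx     = (data >> KQ_BUF_POS_SHIFT) & 1;	// which half (0 or 1)
 *	partial_rec = (data >> KQ_PARTREC_SHIFT) & 1;	// half ends mid-record
 *	final_buf   = (data >> KQ_FINAL_BUF_SHIFT) & 1;	// last buffer of session
 */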

static int
spe_backend_read(struct hwt_vm *vm, int *ident, vm_offset_t *offset,
    uint64_t *data)
{
	struct arm_spe_queue *q;
	struct arm_spe_softc *sc = device_get_softc(spe_dev);
	int error = 0;

	mtx_lock_spin(&sc->sc_lock);

	/* Return the first pending buffer that needs servicing */
	q = STAILQ_FIRST(&sc->pending);
	if (q == NULL) {
		error = ENOENT;
		goto error;
	}
	*ident = q->ident;
	*offset = q->offset;
	*data = (q->buf_idx << KQ_BUF_POS_SHIFT) |
	    (q->partial_rec << KQ_PARTREC_SHIFT) |
	    (q->final_buf << KQ_FINAL_BUF_SHIFT);

	STAILQ_REMOVE_HEAD(&sc->pending, next);
	sc->npending--;

error:
	mtx_unlock_spin(&sc->sc_lock);
	if (error)
		return (error);

	free(q, M_ARM_SPE);
	return (0);
}

static struct hwt_backend_ops spe_ops = {
	.hwt_backend_init = spe_backend_init,
	.hwt_backend_deinit = spe_backend_deinit,

	.hwt_backend_configure = spe_backend_configure,
	.hwt_backend_svc_buf = spe_backend_svc_buf,
	.hwt_backend_stop = spe_backend_stop,

	.hwt_backend_enable_smp = spe_backend_enable_smp,
	.hwt_backend_disable_smp = spe_backend_disable_smp,

	.hwt_backend_read = spe_backend_read,
};

int
spe_register(device_t dev)
{
	spe_dev = dev;

	return (hwt_backend_register(&backend));
}