/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2025 Oxide Computer Company
 */

/*
 * PCIe shenanigans
 *
 * Currently this implements several different views at seeing into PCIe devices
 * and is designed to (hopefully) replace pcitool and be a vector for new system
 * functionality such as dealing with multicast filtering, ACS, etc.
 *
 * While most subcommands have their own implementations, there are a couple of
 * things that are worth bearing in mind:
 *
 * 1) Where possible, prefer the use of libofmt. In particular, having good,
 *    parsable output is important. New subcommands should strive to meet that.
 *
 * 2) Because we're often processing binary data (and it's good hygiene),
 *    subcommands should make sure to drop privileges as early as they can by
 *    calling pcieadm_init_privs(). More on privileges below.
 *
 * Privilege Management
 * --------------------
 *
 * In an attempt to minimize privilege exposure, but to allow subcommands
 * flexibility when required (e.g. show-cfgspace needs full privs to read from
 * the kernel), we have two privilege sets that we maintain. One which is the
 * minimal privs, which basically is a set that has stripped everything. This
 * is 'pia_priv_min'. The second is one that allows a subcommand to add in
 * privileges that it requires which will be left in the permitted set. These
 * are in 'pia_priv_eff'.
It's important to know that this set is always 43 * intersected with what the user actually has, so this is not meant to be a way 44 * for a caller to get more privileges than they already have. 45 * 46 * A subcommand is expected to call pcieadm_init_privs() once they have 47 * processed enough arguments that they can set an upper bound on privileges. 48 * It's worth noting that a subcommand will be executed in an already minimial 49 * environment; however, we will have already set up a libdevinfo handle for 50 * them, which should make the need to do much more not so bad. 51 */ 52 53 #include <stdio.h> 54 #include <stdlib.h> 55 #include <stdarg.h> 56 #include <unistd.h> 57 #include <err.h> 58 #include <libdevinfo.h> 59 #include <strings.h> 60 #include <sys/stat.h> 61 #include <sys/pci_tools.h> 62 #include <sys/pci.h> 63 #include <sys/types.h> 64 #include <fcntl.h> 65 #include <sys/debug.h> 66 #include <upanic.h> 67 #include <libgen.h> 68 69 #include "pcieadm.h" 70 71 pcieadm_t pcieadm; 72 const char *pcieadm_progname; 73 74 void 75 pcieadm_init_privs(pcieadm_t *pcip) 76 { 77 static const char *msg = "attempted to re-initialize privileges"; 78 if (pcip->pia_priv_init == NULL) { 79 upanic(msg, strlen(msg)); 80 } 81 82 priv_intersect(pcip->pia_priv_init, pcip->pia_priv_eff); 83 84 if (setppriv(PRIV_SET, PRIV_PERMITTED, pcieadm.pia_priv_eff) != 0) { 85 err(EXIT_FAILURE, "failed to reduce privileges"); 86 } 87 88 if (setppriv(PRIV_SET, PRIV_LIMIT, pcieadm.pia_priv_eff) != 0) { 89 err(EXIT_FAILURE, "failed to reduce privileges"); 90 } 91 92 priv_freeset(pcip->pia_priv_init); 93 pcip->pia_priv_init = NULL; 94 } 95 96 void 97 pcieadm_indent(void) 98 { 99 pcieadm.pia_indent += 2; 100 } 101 102 void 103 pcieadm_deindent(void) 104 { 105 VERIFY3U(pcieadm.pia_indent, >, 0); 106 pcieadm.pia_indent -= 2; 107 } 108 109 void 110 pcieadm_print(const char *fmt, ...) 
111 { 112 va_list ap; 113 114 if (pcieadm.pia_indent > 0) { 115 (void) printf("%*s", pcieadm.pia_indent, ""); 116 } 117 118 va_start(ap, fmt); 119 (void) vprintf(fmt, ap); 120 va_end(ap); 121 } 122 123 void 124 pcieadm_ofmt_errx(const char *fmt, ...) 125 { 126 va_list ap; 127 128 va_start(ap, fmt); 129 verrx(EXIT_FAILURE, fmt, ap); 130 } 131 132 /* 133 * We determine if a node is PCI in a two step process. The first is to see if 134 * the node's name starts with pci, and has an additional character that 135 * indicates it's not the synthetic root of the tree. However, the node name 136 * changes for some classes of devices such as GPUs. As such, for those we try 137 * to look at the compatible property and see if we have a pciexclass or 138 * pciclass entry. We look specifically for the class to make sure that we don't 139 * fall for the synthetic nodes that have a compatible property of 140 * 'pciex_root_complex'. 141 * 142 * The compatible property is a single string that is actually a compressed 143 * string. That is, there are multiple strings concatenated together in a single 144 * pointer. 
145 */ 146 static boolean_t 147 pcieadm_di_node_is_pci(di_node_t node) 148 { 149 const char *name; 150 char *compat; 151 int nents; 152 153 name = di_node_name(node); 154 if (strncmp("pci", name, 3) == 0) { 155 return (name[3] != '\0'); 156 } 157 158 nents = di_prop_lookup_strings(DDI_DEV_T_ANY, node, "compatible", 159 &compat); 160 if (nents <= 0) { 161 return (B_FALSE); 162 } 163 164 for (int i = 0; i < nents; i++) { 165 if (strncmp("pciclass,", compat, strlen("pciclass,")) == 0 || 166 strncmp("pciexclass,", compat, strlen("pciexclass,")) == 167 0) { 168 return (B_TRUE); 169 } 170 171 compat += strlen(compat) + 1; 172 } 173 174 return (B_FALSE); 175 } 176 177 static int 178 pcieadm_di_walk_cb(di_node_t node, void *arg) 179 { 180 pcieadm_di_walk_t *walk = arg; 181 182 if (!pcieadm_di_node_is_pci(node)) { 183 return (DI_WALK_CONTINUE); 184 } 185 186 return (walk->pdw_func(node, walk->pdw_arg)); 187 } 188 189 static di_node_t 190 pcieadm_di_root(pcieadm_t *pcip) 191 { 192 if (pcip->pia_root == DI_NODE_NIL) { 193 pcip->pia_root = di_init("/", DINFOCPYALL); 194 if (pcip->pia_root == DI_NODE_NIL) { 195 err(EXIT_FAILURE, "failed to initialize devinfo tree"); 196 } 197 } 198 199 return (pcip->pia_root); 200 } 201 202 void 203 pcieadm_di_walk(pcieadm_t *pcip, pcieadm_di_walk_t *arg) 204 { 205 (void) di_walk_node(pcieadm_di_root(pcip), DI_WALK_CLDFIRST, arg, 206 pcieadm_di_walk_cb); 207 } 208 209 /* 210 * Attempt to find the nexus that corresponds to this device. To do this, we 211 * walk up and walk the minors until we find a "reg" minor. 
212 */ 213 void 214 pcieadm_find_nexus(pcieadm_t *pia) 215 { 216 di_node_t cur; 217 218 for (cur = di_parent_node(pia->pia_devi); cur != DI_NODE_NIL; 219 cur = di_parent_node(cur)) { 220 di_minor_t minor = DI_MINOR_NIL; 221 222 while ((minor = di_minor_next(cur, minor)) != DI_MINOR_NIL) { 223 if (di_minor_spectype(minor) == S_IFCHR && 224 strcmp(di_minor_name(minor), "reg") == 0) { 225 pia->pia_nexus = cur; 226 return; 227 } 228 } 229 } 230 } 231 232 static int 233 pcieadm_find_dip_cb(di_node_t node, void *arg) 234 { 235 char *path = NULL, *driver; 236 char dinst[128], bdf[128], altbdf[128]; 237 int inst, nprop, *regs; 238 pcieadm_t *pia = arg; 239 240 path = di_devfs_path(node); 241 if (path == NULL) { 242 err(EXIT_FAILURE, "failed to construct devfs path for node: " 243 "%s", di_node_name(node)); 244 } 245 246 driver = di_driver_name(node); 247 inst = di_instance(node); 248 if (driver != NULL && inst != -1) { 249 (void) snprintf(dinst, sizeof (dinst), "%s%d", driver, inst); 250 } 251 252 nprop = di_prop_lookup_ints(DDI_DEV_T_ANY, node, "reg", ®s); 253 if (nprop <= 0) { 254 errx(EXIT_FAILURE, "failed to lookup regs array for %s", 255 path); 256 } 257 (void) snprintf(bdf, sizeof (bdf), "%x/%x/%x", PCI_REG_BUS_G(regs[0]), 258 PCI_REG_DEV_G(regs[0]), PCI_REG_FUNC_G(regs[0])); 259 (void) snprintf(altbdf, sizeof (altbdf), "%02x/%02x/%02x", 260 PCI_REG_BUS_G(regs[0]), PCI_REG_DEV_G(regs[0]), 261 PCI_REG_FUNC_G(regs[0])); 262 263 if (strcmp(pia->pia_devstr, path) == 0 || 264 strcmp(pia->pia_devstr, bdf) == 0 || 265 strcmp(pia->pia_devstr, altbdf) == 0 || 266 (driver != NULL && inst != -1 && 267 strcmp(pia->pia_devstr, dinst) == 0)) { 268 if (pia->pia_devi != DI_NODE_NIL) { 269 errx(EXIT_FAILURE, "device name matched two device " 270 "nodes: %s and %s", di_node_name(pia->pia_devi), 271 di_node_name(node)); 272 } 273 274 pia->pia_devi = node; 275 } 276 277 if (path != NULL) { 278 di_devfs_path_free(path); 279 } 280 281 return (DI_WALK_CONTINUE); 282 } 283 284 void 285 
pcieadm_find_dip(pcieadm_t *pcip, const char *device) 286 { 287 pcieadm_di_walk_t walk; 288 289 /* 290 * If someone specifies /devices, just skip over it. 291 */ 292 pcip->pia_devstr = device; 293 if (strncmp("/devices", device, strlen("/devices")) == 0) { 294 pcip->pia_devstr += strlen("/devices"); 295 } 296 297 pcip->pia_devi = DI_NODE_NIL; 298 walk.pdw_arg = pcip; 299 walk.pdw_func = pcieadm_find_dip_cb; 300 pcieadm_di_walk(pcip, &walk); 301 302 if (pcip->pia_devi == DI_NODE_NIL) { 303 errx(EXIT_FAILURE, "failed to find device node %s", device); 304 } 305 306 pcip->pia_nexus = DI_NODE_NIL; 307 pcieadm_find_nexus(pcip); 308 if (pcip->pia_nexus == DI_NODE_NIL) { 309 errx(EXIT_FAILURE, "failed to find nexus for %s", device); 310 } 311 } 312 313 typedef struct pcieadm_cfgspace_file { 314 int pcfi_fd; 315 } pcieadm_cfgspace_file_t; 316 317 static boolean_t 318 pcieadm_read_cfgspace_file(uint32_t off, uint8_t len, void *buf, void *arg) 319 { 320 uint32_t bufoff = 0; 321 pcieadm_cfgspace_file_t *pcfi = arg; 322 323 while (len > 0) { 324 ssize_t ret = pread(pcfi->pcfi_fd, buf + bufoff, len, off); 325 if (ret < 0) { 326 err(EXIT_FAILURE, "failed to read %u bytes at %" 327 PRIu32, len, off); 328 } else if (ret == 0) { 329 warnx("hit unexpected EOF reading cfgspace from file " 330 "at offest %" PRIu32 ", still wanted to read %u " 331 "bytes", off, len); 332 return (B_FALSE); 333 } else { 334 len -= ret; 335 off += ret; 336 bufoff += ret; 337 } 338 339 } 340 341 return (B_TRUE); 342 } 343 344 void 345 pcieadm_init_cfgspace_file(pcieadm_t *pcip, const char *path, 346 pcieadm_cfgspace_f *funcp, void **arg) 347 { 348 int fd; 349 struct stat st; 350 pcieadm_cfgspace_file_t *pcfi; 351 352 if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pcip->pia_priv_eff) != 0) { 353 err(EXIT_FAILURE, "failed to raise privileges"); 354 } 355 356 if ((fd = open(path, O_RDONLY)) < 0) { 357 err(EXIT_FAILURE, "failed to open input file %s", path); 358 } 359 360 if (fstat(fd, &st) != 0) { 361 
err(EXIT_FAILURE, "failed to get stat information for %s", 362 path); 363 } 364 365 if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pcip->pia_priv_min) != 0) { 366 err(EXIT_FAILURE, "failed to reduce privileges"); 367 } 368 369 if (S_ISDIR(st.st_mode)) { 370 errx(EXIT_FAILURE, "input file %s is a directory, unable " 371 "to read data", path); 372 } 373 374 if (S_ISLNK(st.st_mode)) { 375 errx(EXIT_FAILURE, "input file %s is a symbolic link, unable " 376 "to read data", path); 377 } 378 379 if (S_ISDOOR(st.st_mode)) { 380 errx(EXIT_FAILURE, "input file %s is a door, unable " 381 "to read data", path); 382 } 383 384 if (S_ISPORT(st.st_mode)) { 385 errx(EXIT_FAILURE, "input file %s is an event port, unable " 386 "to read data", path); 387 } 388 389 /* 390 * Assume if we were given a FIFO, character/block device, socket, or 391 * something else that it's probably fine. 392 */ 393 pcfi = calloc(1, sizeof (*pcfi)); 394 if (pcfi == NULL) { 395 err(EXIT_FAILURE, "failed to allocate memory for reading " 396 "cfgspace data from a file"); 397 } 398 399 pcfi->pcfi_fd = fd; 400 *arg = pcfi; 401 *funcp = pcieadm_read_cfgspace_file; 402 } 403 404 void 405 pcieadm_fini_cfgspace_file(void *arg) 406 { 407 pcieadm_cfgspace_file_t *pcfi = arg; 408 VERIFY0(close(pcfi->pcfi_fd)); 409 free(pcfi); 410 } 411 412 typedef struct pcieadm_cfgspace_kernel { 413 pcieadm_t *pck_pci; 414 int pck_fd; 415 uint8_t pck_bus; 416 uint8_t pck_dev; 417 uint8_t pck_func; 418 } pcieadm_cfgspace_kernel_t; 419 420 static boolean_t 421 pcieadm_read_cfgspace_kernel(uint32_t off, uint8_t len, void *buf, void *arg) 422 { 423 pcieadm_cfgspace_kernel_t *pck = arg; 424 pcieadm_t *pcip = pck->pck_pci; 425 pcitool_reg_t pci_reg; 426 427 bzero(&pci_reg, sizeof (pci_reg)); 428 pci_reg.user_version = PCITOOL_VERSION; 429 pci_reg.bus_no = pck->pck_bus; 430 pci_reg.dev_no = pck->pck_dev; 431 pci_reg.func_no = pck->pck_func; 432 pci_reg.barnum = 0; 433 pci_reg.offset = off; 434 pci_reg.acc_attr = PCITOOL_ACC_ATTR_ENDN_LTL; 435 436 
switch (len) { 437 case 1: 438 pci_reg.acc_attr += PCITOOL_ACC_ATTR_SIZE_1; 439 break; 440 case 2: 441 pci_reg.acc_attr += PCITOOL_ACC_ATTR_SIZE_2; 442 break; 443 case 4: 444 pci_reg.acc_attr += PCITOOL_ACC_ATTR_SIZE_4; 445 break; 446 case 8: 447 pci_reg.acc_attr += PCITOOL_ACC_ATTR_SIZE_8; 448 break; 449 default: 450 errx(EXIT_FAILURE, "asked to read invalid size from kernel: %u", 451 len); 452 } 453 454 if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pcip->pia_priv_eff) != 0) { 455 err(EXIT_FAILURE, "failed to raise privileges"); 456 } 457 458 if (ioctl(pck->pck_fd, PCITOOL_DEVICE_GET_REG, &pci_reg) != 0) { 459 err(EXIT_FAILURE, "failed to read device offset 0x%x", off); 460 } 461 462 if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pcip->pia_priv_min) != 0) { 463 err(EXIT_FAILURE, "failed to reduce privileges"); 464 } 465 466 switch (len) { 467 case 1: 468 *(uint8_t *)buf = (uint8_t)pci_reg.data; 469 break; 470 case 2: 471 *(uint16_t *)buf = (uint16_t)pci_reg.data; 472 break; 473 case 4: 474 *(uint32_t *)buf = (uint32_t)pci_reg.data; 475 break; 476 case 8: 477 *(uint64_t *)buf = (uint64_t)pci_reg.data; 478 break; 479 } 480 481 return (B_TRUE); 482 } 483 484 void 485 pcieadm_init_cfgspace_kernel(pcieadm_t *pcip, pcieadm_cfgspace_f *funcp, 486 void **arg) 487 { 488 char *nexus_base; 489 char nexus_reg[PATH_MAX]; 490 int fd, nregs, *regs; 491 pcieadm_cfgspace_kernel_t *pck; 492 493 if ((nexus_base = di_devfs_path(pcip->pia_nexus)) == NULL) { 494 err(EXIT_FAILURE, "failed to get path to nexus node"); 495 } 496 497 if (snprintf(nexus_reg, sizeof (nexus_reg), "/devices%s:reg", 498 nexus_base) >= sizeof (nexus_reg)) { 499 errx(EXIT_FAILURE, "failed to construct nexus path, path " 500 "overflow"); 501 } 502 free(nexus_base); 503 504 if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pcip->pia_priv_eff) != 0) { 505 err(EXIT_FAILURE, "failed to raise privileges"); 506 } 507 508 if ((fd = open(nexus_reg, O_RDONLY)) < 0) { 509 err(EXIT_FAILURE, "failed to open %s", nexus_reg); 510 } 511 512 if 
(setppriv(PRIV_SET, PRIV_EFFECTIVE, pcip->pia_priv_min) != 0) { 513 err(EXIT_FAILURE, "failed to reduce privileges"); 514 } 515 516 nregs = di_prop_lookup_ints(DDI_DEV_T_ANY, pcip->pia_devi, "reg", 517 ®s); 518 if (nregs <= 0) { 519 errx(EXIT_FAILURE, "failed to lookup regs array for %s", 520 pcip->pia_devstr); 521 } 522 523 pck = calloc(1, sizeof (pcieadm_cfgspace_kernel_t)); 524 if (pck == NULL) { 525 err(EXIT_FAILURE, "failed to allocate memory for reading " 526 "kernel cfgspace data"); 527 } 528 529 pck->pck_pci = pcip; 530 pck->pck_fd = fd; 531 pck->pck_bus = PCI_REG_BUS_G(regs[0]); 532 pck->pck_dev = PCI_REG_DEV_G(regs[0]); 533 pck->pck_func = PCI_REG_FUNC_G(regs[0]); 534 535 *funcp = pcieadm_read_cfgspace_kernel; 536 *arg = pck; 537 } 538 539 void 540 pcieadm_fini_cfgspace_kernel(void *arg) 541 { 542 pcieadm_cfgspace_kernel_t *pck = arg; 543 544 VERIFY0(close(pck->pck_fd)); 545 free(pck); 546 } 547 548 static const pcieadm_cmdtab_t pcieadm_cmds[] = { 549 { "save-cfgspace", pcieadm_save_cfgspace, pcieadm_save_cfgspace_usage }, 550 { "show-cfgspace", pcieadm_show_cfgspace, pcieadm_show_cfgspace_usage }, 551 { "show-devs", pcieadm_show_devs, pcieadm_show_devs_usage }, 552 { NULL } 553 }; 554 555 static void 556 pcieadm_usage(const char *format, ...) 
557 { 558 uint_t cmd; 559 560 if (format != NULL) { 561 va_list ap; 562 563 va_start(ap, format); 564 vwarnx(format, ap); 565 va_end(ap); 566 } 567 568 (void) fprintf(stderr, "usage: %s <subcommand> <args> ...\n\n", 569 pcieadm_progname); 570 571 for (cmd = 0; pcieadm_cmds[cmd].pct_name != NULL; cmd++) { 572 if (pcieadm_cmds[cmd].pct_use != NULL) { 573 pcieadm_cmds[cmd].pct_use(stderr); 574 } 575 } 576 } 577 578 int 579 main(int argc, char *argv[]) 580 { 581 uint_t cmd; 582 583 pcieadm_progname = basename(argv[0]); 584 585 if (argc < 2) { 586 pcieadm_usage("missing required sub-command"); 587 exit(EXIT_USAGE); 588 } 589 590 for (cmd = 0; pcieadm_cmds[cmd].pct_name != NULL; cmd++) { 591 if (strcmp(pcieadm_cmds[cmd].pct_name, argv[1]) == 0) { 592 break; 593 } 594 } 595 596 if (pcieadm_cmds[cmd].pct_name == NULL) { 597 pcieadm_usage("unknown sub-command: %s", argv[1]); 598 exit(EXIT_USAGE); 599 } 600 argc -= 2; 601 argv += 2; 602 optind = 0; 603 pcieadm.pia_cmdtab = &pcieadm_cmds[cmd]; 604 605 /* 606 * Set up common things that all of pcieadm needs before dispatching to 607 * a specific sub-command. 608 */ 609 pcieadm.pia_pcidb = pcidb_open(PCIDB_VERSION); 610 if (pcieadm.pia_pcidb == NULL) { 611 err(EXIT_FAILURE, "failed to open PCI ID database"); 612 } 613 614 /* 615 * Set up privileges now that we have already opened our core libraries. 616 * We first set up the minimum actual privilege set that we use while 617 * running. We next set up a second privilege set that has additional 618 * privileges that are intersected with the users actual privileges and 619 * are appended to by the underlying command backends. 
620 */ 621 if ((pcieadm.pia_priv_init = priv_allocset()) == NULL) { 622 err(EXIT_FAILURE, "failed to allocate privilege set"); 623 } 624 625 if (getppriv(PRIV_EFFECTIVE, pcieadm.pia_priv_init) != 0) { 626 err(EXIT_FAILURE, "failed to get current privileges"); 627 } 628 629 if ((pcieadm.pia_priv_min = priv_allocset()) == NULL) { 630 err(EXIT_FAILURE, "failed to allocate privilege set"); 631 } 632 633 if ((pcieadm.pia_priv_eff = priv_allocset()) == NULL) { 634 err(EXIT_FAILURE, "failed to allocate privilege set"); 635 } 636 637 /* 638 * Note, PRIV_FILE_READ is not removed from the basic set so that way we 639 * can still open libraries that are required due to lazy loading. 640 */ 641 priv_basicset(pcieadm.pia_priv_min); 642 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_FILE_LINK_ANY)); 643 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_PROC_INFO)); 644 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_PROC_SESSION)); 645 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_PROC_FORK)); 646 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_NET_ACCESS)); 647 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_FILE_WRITE)); 648 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_PROC_EXEC)); 649 VERIFY0(priv_delset(pcieadm.pia_priv_min, PRIV_PROC_EXEC)); 650 651 priv_copyset(pcieadm.pia_priv_min, pcieadm.pia_priv_eff); 652 priv_intersect(pcieadm.pia_priv_init, pcieadm.pia_priv_eff); 653 654 if (setppriv(PRIV_SET, PRIV_EFFECTIVE, pcieadm.pia_priv_min) != 0) { 655 err(EXIT_FAILURE, "failed to reduce privileges"); 656 } 657 658 return (pcieadm.pia_cmdtab->pct_func(&pcieadm, argc, argv)); 659 } 660