// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
#include <linux/unaligned.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/moduleparam.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/io.h>
#include <cxl/mailbox.h>
#include "cxlmem.h"
#include "cxlpci.h"
#include "cxl.h"
#include "pmu.h"

/**
 * DOC: cxl pci
 *
 * This implements the PCI exclusive functionality for a CXL device as it is
 * defined by the Compute Express Link specification. CXL devices may surface
 * certain functionality even if it isn't CXL enabled. While this driver is
 * focused around the PCI specific aspects of a CXL device, it binds to the
 * specific CXL memory device class code, and therefore the implementation of
 * cxl_pci is focused around CXL memory devices.
 *
 * The driver has several responsibilities, mainly:
 * - Create the memX device and register it on the CXL bus.
 * - Enumerate the device's register interface and map it.
 * - Register an nvdimm bridge device with cxl_core.
 * - Register a CXL mailbox with cxl_core.
 */

#define cxl_doorbell_busy(cxlds)                                          \
	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &            \
	 CXLDEV_MBOX_CTRL_DOORBELL)

/* CXL 2.0 - 8.2.8.4 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)

/*
 * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
 * dictate how long to wait for the mailbox to become ready. The new
 * field allows the device to tell software the amount of time to wait
 * before mailbox ready. This field per the spec theoretically allows
 * for up to 255 seconds. 255 seconds is unreasonably long, it's longer
 * than the maximum SATA port link recovery wait. Default to 60 seconds
 * until someone builds a CXL device that needs more time in practice.
 */
static unsigned short mbox_ready_timeout = 60;
module_param(mbox_ready_timeout, ushort, 0644);
MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");

static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
	const unsigned long start = jiffies;
	unsigned long end = start;

	while (cxl_doorbell_busy(cxlds)) {
		end = jiffies;

		if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
			/* Check again in case preempted before timeout test */
			if (!cxl_doorbell_busy(cxlds))
				break;
			return -ETIMEDOUT;
		}
		cpu_relax();
	}

	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
		jiffies_to_msecs(end) - jiffies_to_msecs(start));
	return 0;
}

#define cxl_err(dev, status, msg)                                         \
	dev_err_ratelimited(dev, msg ", device state %s%s\n",             \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",    \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

#define cxl_cmd_err(dev, cmd, status, msg)                                \
	dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
			    (cmd)->opcode,                                 \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",    \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

/*
 * Threaded irq dev_id's must be globally unique. cxl_dev_id provides a unique
 * wrapper object for each irq within the same cxlds.
 */
struct cxl_dev_id {
	struct cxl_dev_state *cxlds;
};

static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
			   irq_handler_t thread_fn)
{
	struct device *dev = cxlds->dev;
	struct cxl_dev_id *dev_id;

	dev_id = devm_kzalloc(dev, sizeof(*dev_id), GFP_KERNEL);
	if (!dev_id)
		return -ENOMEM;
	dev_id->cxlds = cxlds;

	return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
					 IRQF_SHARED | IRQF_ONESHOT, NULL,
					 dev_id);
}

static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
{
	u64 reg;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
}

static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
{
	u64 reg;
	u16 opcode;
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);

	if (!cxl_mbox_background_complete(cxlds))
		return IRQ_NONE;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
	if (opcode == CXL_MBOX_OP_SANITIZE) {
		mutex_lock(&cxl_mbox->mbox_mutex);
		if (mds->security.sanitize_node)
			mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
		mutex_unlock(&cxl_mbox->mbox_mutex);
	} else {
		/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
		rcuwait_wake_up(&cxl_mbox->mbox_wait);
	}

	return IRQ_HANDLED;
}

/*
 * Sanitization operation polling mode.
 */
static void cxl_mbox_sanitize_work(struct work_struct *work)
{
	struct cxl_memdev_state *mds =
		container_of(work, typeof(*mds), security.poll_dwork.work);
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;

	mutex_lock(&cxl_mbox->mbox_mutex);
	if (cxl_mbox_background_complete(cxlds)) {
		mds->security.poll_tmo_secs = 0;
		if (mds->security.sanitize_node)
			sysfs_notify_dirent(mds->security.sanitize_node);
		mds->security.sanitize_active = false;

		dev_dbg(cxlds->dev, "Sanitization operation ended\n");
	} else {
		int timeout = mds->security.poll_tmo_secs + 10;

		mds->security.poll_tmo_secs = min(15 * 60, timeout);
		schedule_delayed_work(&mds->security.poll_dwork, timeout * HZ);
	}
	mutex_unlock(&cxl_mbox->mbox_mutex);
}

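/*
 * Usage sketch: driver facilities are not expected to call the low-level
 * send routine below directly. They fill out a struct cxl_mbox_cmd and
 * submit it through cxl_internal_send_cmd(), which checks the payload
 * sizes against cxl_mbox->payload_size before dispatching to the
 * registered ->mbox_send() callback. See cxl_event_get_int_policy()
 * later in this file, roughly:
 *
 *	struct cxl_mbox_cmd mbox_cmd = {
 *		.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
 *		.payload_out = policy,
 *		.size_out = sizeof(*policy),
 *	};
 *	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
 */
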
/**
 * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
 * @cxl_mbox: CXL mailbox context
 * @mbox_cmd: Command to send to the memory device.
 *
 * Context: Any context. Expects mbox_mutex to be held.
 * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
 *         Caller should check the return code in @mbox_cmd to make sure it
 *         succeeded.
 *
 * This is a generic form of the CXL mailbox send command thus only using the
 * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
 * devices, and perhaps other types of CXL devices may have further information
 * available upon error conditions. Driver facilities wishing to send mailbox
 * commands should use the wrapper command.
 *
 * The CXL spec allows for up to two mailboxes. The intention is for the primary
 * mailbox to be OS controlled and the secondary mailbox to be used by system
 * firmware. This allows the OS and firmware to communicate with the device and
 * not need to coordinate with each other. The driver only uses the primary
 * mailbox.
 */
static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
				   struct cxl_mbox_cmd *mbox_cmd)
{
	struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
	struct device *dev = cxlds->dev;
	u64 cmd_reg, status_reg;
	size_t out_len;
	int rc;

	lockdep_assert_held(&cxl_mbox->mbox_mutex);

	/*
	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
	 * 1. Caller reads MB Control Register to verify doorbell is clear
	 * 2. Caller writes Command Register
	 * 3. Caller writes Command Payload Registers if input payload is non-empty
	 * 4. Caller writes MB Control Register to set doorbell
	 * 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
	 * 6. Caller reads MB Status Register to fetch Return code
	 * 7. If command successful, Caller reads Command Register to get Payload Length
	 * 8. If output payload is non-empty, host reads Command Payload Registers
	 *
	 * Hardware is free to do whatever it wants before the doorbell is rung,
	 * and isn't allowed to change anything after it clears the doorbell. As
	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
	 * also happen in any order (though some orders might not make sense).
	 */

	/* #1 */
	if (cxl_doorbell_busy(cxlds)) {
		u64 md_status =
			readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
			    "mailbox queue busy");
		return -EBUSY;
	}

	/*
	 * With sanitize polling, hardware might be done and the poller still
	 * not be in sync. Ensure no new command comes in until the poller has
	 * caught up. Keep the hardware semantics and only allow device health
	 * status.
	 */
	if (mds->security.poll_tmo_secs > 0) {
		if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
			return -EBUSY;
	}

	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
			     mbox_cmd->opcode);
	if (mbox_cmd->size_in) {
		if (WARN_ON(!mbox_cmd->payload_in))
			return -EINVAL;

		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
				      mbox_cmd->size_in);
		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
	}

	/* #2, #3 */
	writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);

	/* #4 */
	dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
	writel(CXLDEV_MBOX_CTRL_DOORBELL,
	       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	/* #5 */
	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
	if (rc == -ETIMEDOUT) {
		u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
		return rc;
	}

	/* #6 */
	status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
	mbox_cmd->return_code =
		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);

	/*
	 * Handle the background command in a synchronous manner.
	 *
	 * All other mailbox commands will serialize/queue on the mbox_mutex,
	 * which we currently hold. Furthermore this also guarantees that
	 * cxl_mbox_background_complete() checks are safe amongst each other,
	 * in that no new bg operation can occur in between.
	 *
	 * Background operations are timesliced in accordance with the nature
	 * of the command. In the event of timeout, the mailbox state is
	 * indeterminate until the next successful command submission and the
	 * driver can get back in sync with the hardware state.
	 */
	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
		u64 bg_status_reg;
		int i, timeout;

		/*
		 * Sanitization is a special case which monopolizes the device
		 * and cannot be timesliced. Handle asynchronously instead,
		 * and allow userspace to poll(2) for completion.
		 */
		if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
			if (mds->security.sanitize_active)
				return -EBUSY;

			/* give first timeout a second */
			timeout = 1;
			mds->security.poll_tmo_secs = timeout;
			mds->security.sanitize_active = true;
			schedule_delayed_work(&mds->security.poll_dwork,
					      timeout * HZ);
			dev_dbg(dev, "Sanitization operation started\n");
			goto success;
		}

		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
			mbox_cmd->opcode);

		timeout = mbox_cmd->poll_interval_ms;
		for (i = 0; i < mbox_cmd->poll_count; i++) {
			if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
					cxl_mbox_background_complete(cxlds),
					TASK_UNINTERRUPTIBLE,
					msecs_to_jiffies(timeout)) > 0)
				break;
		}

		if (!cxl_mbox_background_complete(cxlds)) {
			dev_err(dev, "timeout waiting for background (%d ms)\n",
				timeout * mbox_cmd->poll_count);
			return -ETIMEDOUT;
		}

		bg_status_reg = readq(cxlds->regs.mbox +
				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
		mbox_cmd->return_code =
			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
				  bg_status_reg);
		dev_dbg(dev,
			"Mailbox background operation (0x%04x) completed\n",
			mbox_cmd->opcode);
	}

	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
		dev_dbg(dev, "Mailbox operation had an error: %s\n",
			cxl_mbox_cmd_rc2str(mbox_cmd));
		return 0; /* completed but caller must check return_code */
	}

success:
	/* #7 */
	cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);

	/* #8 */
	if (out_len && mbox_cmd->payload_out) {
		/*
		 * Sanitize the copy. If hardware misbehaves, out_len per the
		 * spec can actually be greater than the max allowed size (21
		 * bits available but spec defined 1M max). The caller also may
		 * have requested less data than the hardware supplied even
		 * within spec.
		 */
		size_t n;

		n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
		memcpy_fromio(mbox_cmd->payload_out, payload, n);
		mbox_cmd->size_out = n;
	} else {
		mbox_cmd->size_out = 0;
	}

	return 0;
}

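/*
 * mutex_lock_io() is used rather than plain mutex_lock() so that time
 * spent blocked behind another mailbox command is accounted as io-wait;
 * the caller is effectively waiting on device I/O.
 */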
static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
			     struct cxl_mbox_cmd *cmd)
{
	int rc;

	mutex_lock_io(&cxl_mbox->mbox_mutex);
	rc = __cxl_pci_mbox_send_cmd(cxl_mbox, cmd);
	mutex_unlock(&cxl_mbox->mbox_mutex);

	return rc;
}

static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
	struct device *dev = cxlds->dev;
	unsigned long timeout;
	int irq, msgnum;
	u64 md_status;
	u32 ctrl;

	timeout = jiffies + mbox_ready_timeout * HZ;
	do {
		md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
		if (md_status & CXLMDEV_MBOX_IF_READY)
			break;
		if (msleep_interruptible(100))
			break;
	} while (!time_after(jiffies, timeout));

	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
		cxl_err(dev, md_status, "timeout awaiting mailbox ready");
		return -ETIMEDOUT;
	}

	/*
	 * A command may be in flight from a previous driver instance
	 * (think kexec), so do one doorbell wait so that
	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
	 * source for future doorbell busy events.
	 */
	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
		cxl_err(dev, md_status, "timeout awaiting mailbox idle");
		return -ETIMEDOUT;
	}

	cxl_mbox->mbox_send = cxl_pci_mbox_send;
	cxl_mbox->payload_size =
		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);

	/*
	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	 *
	 * If the size is too small, mandatory commands will not work and so
	 * there's no point in going forward. If the size is too large, there's
	 * no harm in soft limiting it.
	 */
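	/*
	 * For example, with the power-of-two encoding above, a Payload Size
	 * field of 8 yields the 256 byte spec minimum (1 << 8) and a field
	 * of 20 yields the 1MB spec maximum (1 << 20).
	 */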
	cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
	if (cxl_mbox->payload_size < 256) {
		dev_err(dev, "Mailbox is too small (%zub)",
			cxl_mbox->payload_size);
		return -ENXIO;
	}

	dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);

	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);

	/* background command interrupts are optional */
	if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) || !irq_avail)
		return 0;

	msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
	irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
	if (irq < 0)
		return 0;

	if (cxl_request_irq(cxlds, irq, cxl_pci_mbox_irq))
		return 0;

	dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
	/* enable background command mbox irq support */
	ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
	ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
	writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	return 0;
}

/*
 * Assume that any RCIEP that emits the CXL memory expander class code
 * is an RCD
 */
static bool is_cxl_restricted(struct pci_dev *pdev)
{
	return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
}

static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
				  struct cxl_register_map *map,
				  struct cxl_dport *dport)
{
	resource_size_t component_reg_phys;

	*map = (struct cxl_register_map) {
		.host = &pdev->dev,
		.resource = CXL_RESOURCE_NONE,
	};

	component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
	if (component_reg_phys == CXL_RESOURCE_NONE)
		return -ENXIO;

	map->resource = component_reg_phys;
	map->reg_type = CXL_REGLOC_RBI_COMPONENT;
	map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;

	return 0;
}

static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
			      struct cxl_register_map *map)
{
	int rc;

	rc = cxl_find_regblock(pdev, type, map);

	/*
	 * If the Register Locator DVSEC does not exist, check if it
	 * is an RCH and try to extract the Component Registers from
	 * an RCRB.
	 */
	if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) {
		struct cxl_dport *dport;
		struct cxl_port *port __free(put_cxl_port) =
			cxl_pci_find_port(pdev, &dport);
		if (!port)
			return -EPROBE_DEFER;

		rc = cxl_rcrb_get_comp_regs(pdev, map, dport);
		if (rc)
			return rc;

		rc = cxl_dport_map_rcd_linkcap(pdev, dport);
		if (rc)
			return rc;

	} else if (rc) {
		return rc;
	}

	return cxl_setup_regs(map);
}

static int cxl_pci_ras_unmask(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	void __iomem *addr;
	u32 orig_val, val, mask;
	u16 cap;
	int rc;

	if (!cxlds->regs.ras) {
		dev_dbg(&pdev->dev, "No RAS registers.\n");
		return 0;
	}

	/* BIOS has PCIe AER error control */
	if (!pcie_aer_is_native(pdev))
		return 0;

	rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap);
	if (rc)
		return rc;

	if (cap & PCI_EXP_DEVCTL_URRE) {
		addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);

		mask = CXL_RAS_UNCORRECTABLE_MASK_MASK |
		       CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
		val = orig_val & ~mask;
		writel(val, addr);
		dev_dbg(&pdev->dev,
			"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	if (cap & PCI_EXP_DEVCTL_CERE) {
		addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);
		val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
		writel(val, addr);
		dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	return 0;
}

static void free_event_buf(void *buf)
{
	kvfree(buf);
}

/*
 * There is a single buffer for reading event logs from the mailbox. All logs
 * share this buffer protected by the mds->event_log_lock.
 */
static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_get_event_payload *buf;

	buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	mds->event.buf = buf;

	return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
}

static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
{
	int nvecs;

	/*
	 * Per CXL 3.0 3.1.1 CXL.io Endpoint, a function on a CXL device must
	 * not generate INTx messages if that function participates in
	 * CXL.cache or CXL.mem.
	 *
	 * Additionally, pci_alloc_irq_vectors() arranges for
	 * pci_free_irq_vectors() to be called automatically even though it
	 * does not carry a pcim_* name. See pci_setup_msi_context().
	 */
	nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
				      PCI_IRQ_MSIX | PCI_IRQ_MSI);
	if (nvecs < 1) {
		dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
		return false;
	}
	return true;
}

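/*
 * Threaded handler for the event log interrupts: keep draining the Event
 * Status register until it reads back zero, since new records may post
 * while earlier ones are being fetched.
 */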
static irqreturn_t cxl_event_thread(int irq, void *id)
{
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	u32 status;

	do {
		/*
		 * CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
		 * ignore the reserved upper 32 bits
		 */
		status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
		/* Ignore logs unknown to the driver */
		status &= CXLDEV_EVENT_STATUS_ALL;
		if (!status)
			break;
		cxl_mem_get_event_records(mds, status);
		cond_resched();
	} while (status);

	return IRQ_HANDLED;
}

static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	int irq;

	if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
		return -ENXIO;

	irq = pci_irq_vector(pdev,
			     FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
	if (irq < 0)
		return irq;

	return cxl_request_irq(cxlds, irq, cxl_event_thread);
}

static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
				    struct cxl_event_interrupt_policy *policy)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd = {
		.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
		.payload_out = policy,
		.size_out = sizeof(*policy),
	};
	int rc;

	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
	if (rc < 0)
		dev_err(mds->cxlds.dev,
			"Failed to get event interrupt policy : %d", rc);

	return rc;
}

static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
				    struct cxl_event_interrupt_policy *policy)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd;
	int rc;

	*policy = (struct cxl_event_interrupt_policy) {
		.info_settings = CXL_INT_MSI_MSIX,
		.warn_settings = CXL_INT_MSI_MSIX,
		.failure_settings = CXL_INT_MSI_MSIX,
		.fatal_settings = CXL_INT_MSI_MSIX,
	};

	mbox_cmd = (struct cxl_mbox_cmd) {
		.opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
		.payload_in = policy,
		.size_in = sizeof(*policy),
	};

	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
	if (rc < 0) {
		dev_err(mds->cxlds.dev, "Failed to set event interrupt policy : %d",
			rc);
		return rc;
	}

	/* Retrieve final interrupt settings */
	return cxl_event_get_int_policy(mds, policy);
}

static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_event_interrupt_policy policy;
	int rc;

	rc = cxl_event_config_msgnums(mds, &policy);
	if (rc)
		return rc;

	rc = cxl_event_req_irq(cxlds, policy.info_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.warn_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.failure_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.fatal_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
		return rc;
	}

	return 0;
}

static bool cxl_event_int_is_fw(u8 setting)
{
	u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);

	return mode == CXL_INT_FW;
}

static int cxl_event_config(struct pci_host_bridge *host_bridge,
			    struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_event_interrupt_policy policy;
	int rc;

	/*
	 * When BIOS maintains CXL error reporting control, it will process
	 * event records. Only one agent can do so.
	 */
	if (!host_bridge->native_cxl_error)
		return 0;

	if (!irq_avail) {
		dev_info(mds->cxlds.dev, "No interrupt support, disable event processing.\n");
		return 0;
	}

	rc = cxl_event_get_int_policy(mds, &policy);
	if (rc)
		return rc;

	if (cxl_event_int_is_fw(policy.info_settings) ||
	    cxl_event_int_is_fw(policy.warn_settings) ||
	    cxl_event_int_is_fw(policy.failure_settings) ||
	    cxl_event_int_is_fw(policy.fatal_settings)) {
		dev_err(mds->cxlds.dev,
			"FW still in control of Event Logs despite _OSC settings\n");
		return -EBUSY;
	}

	rc = cxl_mem_alloc_event_buf(mds);
	if (rc)
		return rc;

	rc = cxl_event_irqsetup(mds);
	if (rc)
		return rc;

	cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);

	return 0;
}

static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
{
	int rc;

	/*
	 * Fail the init if there's no mailbox. For a type3 this is out of spec.
	 */
	if (!cxlds->reg_map.device_map.mbox.valid)
		return -ENODEV;

	rc = cxl_mailbox_init(&cxlds->cxl_mbox, cxlds->dev);
	if (rc)
		return rc;

	return 0;
}

static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size_t width)
{
	struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *root_dev;
	struct cxl_dport *dport;
	struct cxl_port *root __free(put_cxl_port) =
		cxl_mem_find_port(cxlmd, &dport);

	if (!root)
		return -ENXIO;

	root_dev = root->uport_dev;
	if (!root_dev)
		return -ENXIO;

	if (!dport->regs.rcd_pcie_cap)
		return -ENXIO;

	guard(device)(root_dev);
	if (!root_dev->driver)
		return -ENXIO;

	switch (width) {
	case 2:
		return sysfs_emit(buf, "%#x\n",
				  readw(dport->regs.rcd_pcie_cap + offset));
	case 4:
		return sysfs_emit(buf, "%#x\n",
				  readl(dport->regs.rcd_pcie_cap + offset));
	default:
		return -EINVAL;
	}
}

static ssize_t rcd_link_cap_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCAP, buf, sizeof(u32));
}
static DEVICE_ATTR_RO(rcd_link_cap);

static ssize_t rcd_link_ctrl_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCTL, buf, sizeof(u16));
}
static DEVICE_ATTR_RO(rcd_link_ctrl);

static ssize_t rcd_link_status_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKSTA, buf, sizeof(u16));
}
static DEVICE_ATTR_RO(rcd_link_status);

static struct attribute *cxl_rcd_attrs[] = {
	&dev_attr_rcd_link_cap.attr,
	&dev_attr_rcd_link_ctrl.attr,
	&dev_attr_rcd_link_status.attr,
	NULL
};

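/*
 * The rcd_* link attributes are only meaningful for a Restricted CXL
 * Device (RCD), where link state is read through dport->regs.rcd_pcie_cap
 * rather than the endpoint's own PCIe capability, so hide the group for
 * non-RCD devices.
 */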
static umode_t cxl_rcd_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct pci_dev *pdev = to_pci_dev(dev);

	if (is_cxl_restricted(pdev))
		return a->mode;

	return 0;
}

static struct attribute_group cxl_rcd_group = {
	.attrs = cxl_rcd_attrs,
	.is_visible = cxl_rcd_visible,
};
__ATTRIBUTE_GROUPS(cxl_rcd);

static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
	struct cxl_memdev_state *mds;
	struct cxl_dev_state *cxlds;
	struct cxl_register_map map;
	struct cxl_memdev *cxlmd;
	int i, rc, pmu_count;
	bool irq_avail;

	/*
	 * Double check the anonymous union trickery in struct cxl_regs
	 * FIXME switch to struct_group()
	 */
	BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
		     offsetof(struct cxl_regs, device_regs.memdev));

	rc = pcim_enable_device(pdev);
	if (rc)
		return rc;
	pci_set_master(pdev);

	mds = cxl_memdev_state_create(&pdev->dev);
	if (IS_ERR(mds))
		return PTR_ERR(mds);
	cxlds = &mds->cxlds;
	pci_set_drvdata(pdev, cxlds);

	cxlds->rcd = is_cxl_restricted(pdev);
	cxlds->serial = pci_get_dsn(pdev);
	cxlds->cxl_dvsec = pci_find_dvsec_capability(
		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
	if (!cxlds->cxl_dvsec)
		dev_warn(&pdev->dev,
			 "Device DVSEC not present, skip CXL.mem init\n");

	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
	if (rc)
		return rc;

	rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
	if (rc)
		return rc;

	/*
	 * If the component registers can't be found, the cxl_pci driver may
	 * still be useful for management functions so don't return an error.
	 */
	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT,
				&cxlds->reg_map);
	if (rc)
		dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
	else if (!cxlds->reg_map.component_map.ras.valid)
		dev_dbg(&pdev->dev, "RAS registers not found\n");

	rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component,
				    BIT(CXL_CM_CAP_CAP_ID_RAS));
	if (rc)
		dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");

	rc = cxl_pci_type3_init_mailbox(cxlds);
	if (rc)
		return rc;

	rc = cxl_await_media_ready(cxlds);
	if (rc == 0)
		cxlds->media_ready = true;
	else
		dev_warn(&pdev->dev, "Media not active (%d)\n", rc);

	irq_avail = cxl_alloc_irq_vectors(pdev);

	rc = cxl_pci_setup_mailbox(mds, irq_avail);
	if (rc)
		return rc;

	rc = cxl_enumerate_cmds(mds);
	if (rc)
		return rc;

	rc = cxl_set_timestamp(mds);
	if (rc)
		return rc;

	rc = cxl_poison_state_init(mds);
	if (rc)
		return rc;

	rc = cxl_dev_state_identify(mds);
	if (rc)
		return rc;

	rc = cxl_mem_create_range_info(mds);
	if (rc)
		return rc;

	cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
	if (IS_ERR(cxlmd))
		return PTR_ERR(cxlmd);

	rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
	if (rc)
		return rc;

	rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
	if (rc)
		return rc;

	pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
	for (i = 0; i < pmu_count; i++) {
		struct cxl_pmu_regs pmu_regs;

		rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
			break;
		}

		rc = cxl_map_pmu_regs(&map, &pmu_regs);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not map PMU regs\n");
			break;
		}

		rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not add PMU instance\n");
			break;
		}
	}

	rc = cxl_event_config(host_bridge, mds, irq_avail);
	if (rc)
		return rc;

	if (cxl_pci_ras_unmask(pdev))
		dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");

	pci_save_state(pdev);

	return rc;
}

static const struct pci_device_id cxl_mem_pci_tbl[] = {
	/* PCI class code for CXL.mem Type-3 Devices */
	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
	{ /* terminate list */ },
};
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);

static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;

	dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
		 dev_name(dev));
	pci_restore_state(pdev);
	if (device_attach(dev) <= 0)
		return PCI_ERS_RESULT_DISCONNECT;
	return PCI_ERS_RESULT_RECOVERED;
}

static void cxl_error_resume(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;

	dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
		 dev->driver ? "successful" : "failed");
}

static void cxl_reset_done(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &pdev->dev;

	/*
	 * FLR does not expect to touch the HDM decoders and related
	 * registers. SBR, however, will wipe all device configurations.
	 * Issue a warning if there was an active decoder before the reset
	 * that no longer exists.
	 */
	guard(device)(&cxlmd->dev);
	if (cxlmd->endpoint &&
	    cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
		dev_crit(dev, "SBR happened without memory regions removal.\n");
		dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	}
}

static const struct pci_error_handlers cxl_error_handlers = {
	.error_detected = cxl_error_detected,
	.slot_reset = cxl_slot_reset,
	.resume = cxl_error_resume,
	.cor_error_detected = cxl_cor_error_detected,
	.reset_done = cxl_reset_done,
};

static struct pci_driver cxl_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = cxl_mem_pci_tbl,
	.probe = cxl_pci_probe,
	.err_handler = &cxl_error_handlers,
	.dev_groups = cxl_rcd_groups,
	.driver = {
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
};

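/*
 * CPER records arrive via the firmware-first (GHES) error path and are
 * queued through cxl_cper_register_work() below. Match each record's
 * segment/bus/device/function to a device bound to this driver and emit
 * it through the normal CXL event tracepoints.
 */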
"successful" : "failed"); 1075 } 1076 1077 static void cxl_reset_done(struct pci_dev *pdev) 1078 { 1079 struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); 1080 struct cxl_memdev *cxlmd = cxlds->cxlmd; 1081 struct device *dev = &pdev->dev; 1082 1083 /* 1084 * FLR does not expect to touch the HDM decoders and related 1085 * registers. SBR, however, will wipe all device configurations. 1086 * Issue a warning if there was an active decoder before the reset 1087 * that no longer exists. 1088 */ 1089 guard(device)(&cxlmd->dev); 1090 if (cxlmd->endpoint && 1091 cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) { 1092 dev_crit(dev, "SBR happened without memory regions removal.\n"); 1093 dev_crit(dev, "System may be unstable if regions hosted system memory.\n"); 1094 add_taint(TAINT_USER, LOCKDEP_STILL_OK); 1095 } 1096 } 1097 1098 static const struct pci_error_handlers cxl_error_handlers = { 1099 .error_detected = cxl_error_detected, 1100 .slot_reset = cxl_slot_reset, 1101 .resume = cxl_error_resume, 1102 .cor_error_detected = cxl_cor_error_detected, 1103 .reset_done = cxl_reset_done, 1104 }; 1105 1106 static struct pci_driver cxl_pci_driver = { 1107 .name = KBUILD_MODNAME, 1108 .id_table = cxl_mem_pci_tbl, 1109 .probe = cxl_pci_probe, 1110 .err_handler = &cxl_error_handlers, 1111 .dev_groups = cxl_rcd_groups, 1112 .driver = { 1113 .probe_type = PROBE_PREFER_ASYNCHRONOUS, 1114 }, 1115 }; 1116 1117 #define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0) 1118 static void cxl_handle_cper_event(enum cxl_event_type ev_type, 1119 struct cxl_cper_event_rec *rec) 1120 { 1121 struct cper_cxl_event_devid *device_id = &rec->hdr.device_id; 1122 struct pci_dev *pdev __free(pci_dev_put) = NULL; 1123 enum cxl_event_log_type log_type; 1124 struct cxl_dev_state *cxlds; 1125 unsigned int devfn; 1126 u32 hdr_flags; 1127 1128 pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type, 1129 device_id->segment_num, device_id->bus_num, 1130 device_id->device_num, device_id->func_num); 1131 1132 devfn = PCI_DEVFN(device_id->device_num, device_id->func_num); 1133 pdev = pci_get_domain_bus_and_slot(device_id->segment_num, 1134 device_id->bus_num, devfn); 1135 if (!pdev) 1136 return; 1137 1138 guard(device)(&pdev->dev); 1139 if (pdev->driver != &cxl_pci_driver) 1140 return; 1141 1142 cxlds = pci_get_drvdata(pdev); 1143 if (!cxlds) 1144 return; 1145 1146 /* Fabricate a log type */ 1147 hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags); 1148 log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags); 1149 1150 cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type, 1151 &uuid_null, &rec->event); 1152 } 1153 1154 static void cxl_cper_work_fn(struct work_struct *work) 1155 { 1156 struct cxl_cper_work_data wd; 1157 1158 while (cxl_cper_kfifo_get(&wd)) 1159 cxl_handle_cper_event(wd.event_type, &wd.rec); 1160 } 1161 static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn); 1162 1163 static int __init cxl_pci_driver_init(void) 1164 { 1165 int rc; 1166 1167 rc = pci_register_driver(&cxl_pci_driver); 1168 if (rc) 1169 return rc; 1170 1171 rc = cxl_cper_register_work(&cxl_cper_work); 1172 if (rc) 1173 pci_unregister_driver(&cxl_pci_driver); 1174 1175 return rc; 1176 } 1177 1178 static void __exit cxl_pci_driver_exit(void) 1179 { 1180 cxl_cper_unregister_work(&cxl_cper_work); 1181 cancel_work_sync(&cxl_cper_work); 1182 pci_unregister_driver(&cxl_pci_driver); 1183 } 1184 1185 module_init(cxl_pci_driver_init); 1186 module_exit(cxl_pci_driver_exit); 1187 MODULE_DESCRIPTION("CXL: PCI manageability"); 1188 
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS("CXL");