// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
#include <linux/unaligned.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/moduleparam.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/io.h>
#include <cxl/mailbox.h>
#include "cxlmem.h"
#include "cxlpci.h"
#include "cxl.h"
#include "pmu.h"

/**
 * DOC: cxl pci
 *
 * This implements the PCI exclusive functionality for a CXL device as it is
 * defined by the Compute Express Link specification. CXL devices may surface
 * certain functionality even if it isn't CXL enabled. While this driver is
 * focused around the PCI specific aspects of a CXL device, it binds to the
 * specific CXL memory device class code, and therefore the implementation of
 * cxl_pci is focused around CXL memory devices.
 *
 * The driver has several responsibilities, mainly:
 * - Create the memX device and register on the CXL bus.
 * - Enumerate the device's register interfaces and map them.
 * - Register the nvdimm bridge device with cxl_core.
 * - Register a CXL mailbox with cxl_core.
 */

#define cxl_doorbell_busy(cxlds)                                          \
	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &           \
	 CXLDEV_MBOX_CTRL_DOORBELL)

/* CXL 2.0 - 8.2.8.4 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)

/*
 * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
 * dictate how long to wait for the mailbox to become ready. The new
 * field allows the device to tell software the amount of time to wait
 * before mailbox ready. This field per the spec theoretically allows
 * for up to 255 seconds. 255 seconds is unreasonably long; it's longer
 * than the maximum SATA port link recovery wait. Default to 60 seconds
 * until someone builds a CXL device that needs more time in practice.
 */
static unsigned short mbox_ready_timeout = 60;
module_param(mbox_ready_timeout, ushort, 0644);
MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");

static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
	const unsigned long start = jiffies;
	unsigned long end = start;

	while (cxl_doorbell_busy(cxlds)) {
		end = jiffies;

		if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
			/* Check again in case preempted before timeout test */
			if (!cxl_doorbell_busy(cxlds))
				break;
			return -ETIMEDOUT;
		}
		cpu_relax();
	}

	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
		jiffies_to_msecs(end) - jiffies_to_msecs(start));
	return 0;
}

#define cxl_err(dev, status, msg)                                         \
	dev_err_ratelimited(dev, msg ", device state %s%s\n",            \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",   \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

#define cxl_cmd_err(dev, cmd, status, msg)                                \
	dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
			    (cmd)->opcode,                                \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",   \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

/*
 * Threaded irq dev_id's must be globally unique. cxl_dev_id provides a unique
 * wrapper object for each irq within the same cxlds.
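 * The wrapper is devm-allocated in cxl_request_irq(), so its lifetime is
 * tied to the underlying struct device.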
 */
struct cxl_dev_id {
	struct cxl_dev_state *cxlds;
};

static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
			   irq_handler_t thread_fn)
{
	struct device *dev = cxlds->dev;
	struct cxl_dev_id *dev_id;

	dev_id = devm_kzalloc(dev, sizeof(*dev_id), GFP_KERNEL);
	if (!dev_id)
		return -ENOMEM;
	dev_id->cxlds = cxlds;

	return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
					 IRQF_SHARED | IRQF_ONESHOT, NULL,
					 dev_id);
}

static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
{
	u64 reg;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
}

static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
{
	u64 reg;
	u16 opcode;
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);

	if (!cxl_mbox_background_complete(cxlds))
		return IRQ_NONE;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
	if (opcode == CXL_MBOX_OP_SANITIZE) {
		mutex_lock(&cxl_mbox->mbox_mutex);
		if (mds->security.sanitize_node)
			mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
		mutex_unlock(&cxl_mbox->mbox_mutex);
	} else {
		/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
		rcuwait_wake_up(&cxl_mbox->mbox_wait);
	}

	return IRQ_HANDLED;
}

/*
 * Sanitization operation polling mode.
 */
static void cxl_mbox_sanitize_work(struct work_struct *work)
{
	struct cxl_memdev_state *mds =
		container_of(work, typeof(*mds), security.poll_dwork.work);
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;

	mutex_lock(&cxl_mbox->mbox_mutex);
	if (cxl_mbox_background_complete(cxlds)) {
		mds->security.poll_tmo_secs = 0;
		if (mds->security.sanitize_node)
			sysfs_notify_dirent(mds->security.sanitize_node);
		mds->security.sanitize_active = false;

		dev_dbg(cxlds->dev, "Sanitization operation ended\n");
	} else {
		int timeout = mds->security.poll_tmo_secs + 10;

		mds->security.poll_tmo_secs = min(15 * 60, timeout);
		schedule_delayed_work(&mds->security.poll_dwork, timeout * HZ);
	}
	mutex_unlock(&cxl_mbox->mbox_mutex);
}

/**
 * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
 * @cxl_mbox: CXL mailbox context
 * @mbox_cmd: Command to send to the memory device.
 *
 * Context: Any context. Expects mbox_mutex to be held.
 * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
 *         Caller should check the return code in @mbox_cmd to make sure it
 *         succeeded.
 *
 * This is a generic form of the CXL mailbox send command thus only using the
 * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
 * devices, and perhaps other types of CXL devices may have further information
 * available upon error conditions. Driver facilities wishing to send mailbox
 * commands should use the wrapper command.
 *
 * The CXL spec allows for up to two mailboxes. The intention is for the primary
 * mailbox to be OS controlled and the secondary mailbox to be used by system
 * firmware.
 * This allows the OS and firmware to communicate with the device without
 * needing to coordinate with each other. The driver only uses the primary
 * mailbox.
 */
static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
				   struct cxl_mbox_cmd *mbox_cmd)
{
	struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
	struct device *dev = cxlds->dev;
	u64 cmd_reg, status_reg;
	size_t out_len;
	int rc;

	lockdep_assert_held(&cxl_mbox->mbox_mutex);

	/*
	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
	 * 1. Caller reads MB Control Register to verify doorbell is clear
	 * 2. Caller writes Command Register
	 * 3. Caller writes Command Payload Registers if input payload is non-empty
	 * 4. Caller writes MB Control Register to set doorbell
	 * 5. Caller either polls for doorbell to be clear or waits for interrupt if configured
	 * 6. Caller reads MB Status Register to fetch Return code
	 * 7. If command successful, Caller reads Command Register to get Payload Length
	 * 8. If output payload is non-empty, host reads Command Payload Registers
	 *
	 * Hardware is free to do whatever it wants before the doorbell is rung,
	 * and isn't allowed to change anything after it clears the doorbell. As
	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
	 * also happen in any order (though some orders might not make sense).
	 */

	/* #1 */
	if (cxl_doorbell_busy(cxlds)) {
		u64 md_status =
			readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
			    "mailbox queue busy");
		return -EBUSY;
	}

	/*
	 * With sanitize polling, hardware might be done and the poller still
	 * not be in sync. Ensure no new command is sent until the poller has
	 * caught up. Keep the hardware semantics and only allow device health
	 * status.
	 */
	if (mds->security.poll_tmo_secs > 0) {
		if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
			return -EBUSY;
	}

	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
			     mbox_cmd->opcode);
	if (mbox_cmd->size_in) {
		if (WARN_ON(!mbox_cmd->payload_in))
			return -EINVAL;

		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
				      mbox_cmd->size_in);
		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
	}

	/* #2, #3 */
	writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);

	/* #4 */
	dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
	writel(CXLDEV_MBOX_CTRL_DOORBELL,
	       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	/* #5 */
	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
	if (rc == -ETIMEDOUT) {
		u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
		return rc;
	}

	/* #6 */
	status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
	mbox_cmd->return_code =
		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);

	/*
	 * Handle the background command in a synchronous manner.
	 *
	 * All other mailbox commands will serialize/queue on the mbox_mutex,
	 * which we currently hold. Furthermore this also guarantees that
	 * cxl_mbox_background_complete() checks are safe amongst each other,
	 * in that no new bg operation can occur in between.
	 *
	 * Background operations are timesliced in accordance with the nature
	 * of the command. In the event of timeout, the mailbox state is
	 * indeterminate until the next successful command submission and the
	 * driver can get back in sync with the hardware state.
	 */
	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
		u64 bg_status_reg;
		int i, timeout;

		/*
		 * Sanitization is a special case which monopolizes the device
		 * and cannot be timesliced. Handle asynchronously instead,
		 * and allow userspace to poll(2) for completion.
		 */
		if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
			if (mds->security.sanitize_active)
				return -EBUSY;

			/* give first timeout a second */
			timeout = 1;
			mds->security.poll_tmo_secs = timeout;
			mds->security.sanitize_active = true;
			schedule_delayed_work(&mds->security.poll_dwork,
					      timeout * HZ);
			dev_dbg(dev, "Sanitization operation started\n");
			goto success;
		}

		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
			mbox_cmd->opcode);

		timeout = mbox_cmd->poll_interval_ms;
		for (i = 0; i < mbox_cmd->poll_count; i++) {
			if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
						       cxl_mbox_background_complete(cxlds),
						       TASK_UNINTERRUPTIBLE,
						       msecs_to_jiffies(timeout)) > 0)
				break;
		}

		if (!cxl_mbox_background_complete(cxlds)) {
			dev_err(dev, "timeout waiting for background (%d ms)\n",
				timeout * mbox_cmd->poll_count);
			return -ETIMEDOUT;
		}

		bg_status_reg = readq(cxlds->regs.mbox +
				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
		mbox_cmd->return_code =
			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
				  bg_status_reg);
		dev_dbg(dev,
			"Mailbox background operation (0x%04x) completed\n",
			mbox_cmd->opcode);
	}

	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
		dev_dbg(dev, "Mailbox operation had an error: %s\n",
			cxl_mbox_cmd_rc2str(mbox_cmd));
		return 0; /* completed but caller must check return_code */
	}

success:
	/* #7 */
	cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);

	/* #8 */
	if (out_len && mbox_cmd->payload_out) {
		/*
		 * Sanitize the copy. If hardware misbehaves, out_len per the
		 * spec can actually be greater than the max allowed size (21
		 * bits available but spec defined 1M max). The caller also may
		 * have requested less data than the hardware supplied even
		 * within spec.
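		 * Clamp the copy length to the smallest of the caller's
		 * buffer, the mailbox payload capacity, and the
		 * device-reported output length.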
		 */
		size_t n;

		n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
		memcpy_fromio(mbox_cmd->payload_out, payload, n);
		mbox_cmd->size_out = n;
	} else {
		mbox_cmd->size_out = 0;
	}

	return 0;
}

static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
			     struct cxl_mbox_cmd *cmd)
{
	int rc;

	mutex_lock_io(&cxl_mbox->mbox_mutex);
	rc = __cxl_pci_mbox_send_cmd(cxl_mbox, cmd);
	mutex_unlock(&cxl_mbox->mbox_mutex);

	return rc;
}

static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
	struct device *dev = cxlds->dev;
	unsigned long timeout;
	int irq, msgnum;
	u64 md_status;
	u32 ctrl;

	timeout = jiffies + mbox_ready_timeout * HZ;
	do {
		md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
		if (md_status & CXLMDEV_MBOX_IF_READY)
			break;
		if (msleep_interruptible(100))
			break;
	} while (!time_after(jiffies, timeout));

	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
		cxl_err(dev, md_status, "timeout awaiting mailbox ready");
		return -ETIMEDOUT;
	}

	/*
	 * A command may be in flight from a previous driver instance
	 * (think kexec), so do one doorbell wait here so that
	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
	 * source for future doorbell busy events.
	 */
	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
		cxl_err(dev, md_status, "timeout awaiting mailbox idle");
		return -ETIMEDOUT;
	}

	cxl_mbox->mbox_send = cxl_pci_mbox_send;
	cxl_mbox->payload_size =
		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);

	/*
	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	 *
	 * If the size is too small, mandatory commands will not work and so
	 * there's no point in going forward. If the size is too large, there's
	 * no harm in soft limiting it.
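	 * (Per the spec the capability encodes a payload size between 256
	 * bytes and 1MB, hence the bounds applied below.)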
	 */
	cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
	if (cxl_mbox->payload_size < 256) {
		dev_err(dev, "Mailbox is too small (%zub)",
			cxl_mbox->payload_size);
		return -ENXIO;
	}

	dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);

	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);

	/* background command interrupts are optional */
	if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) || !irq_avail)
		return 0;

	msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
	irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
	if (irq < 0)
		return 0;

	if (cxl_request_irq(cxlds, irq, cxl_pci_mbox_irq))
		return 0;

	dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
	/* enable background command mbox irq support */
	ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
	ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
	writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	return 0;
}

/*
 * Assume that any RCIEP that emits the CXL memory expander class code
 * is an RCD
 */
static bool is_cxl_restricted(struct pci_dev *pdev)
{
	return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
}

static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
				  struct cxl_register_map *map,
				  struct cxl_dport *dport)
{
	resource_size_t component_reg_phys;

	*map = (struct cxl_register_map) {
		.host = &pdev->dev,
		.resource = CXL_RESOURCE_NONE,
	};

	struct cxl_port *port __free(put_cxl_port) =
		cxl_pci_find_port(pdev, &dport);
	if (!port)
		return -EPROBE_DEFER;

	component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
	if (component_reg_phys == CXL_RESOURCE_NONE)
		return -ENXIO;

	map->resource = component_reg_phys;
	map->reg_type = CXL_REGLOC_RBI_COMPONENT;
	map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;

	return 0;
}

static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
			      struct cxl_register_map *map)
{
	int rc;

	rc = cxl_find_regblock(pdev, type, map);

	/*
	 * If the Register Locator DVSEC does not exist, check if it
	 * is an RCH and try to extract the Component Registers from
	 * an RCRB.
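	 * (RCDs predate the CXL 2.0 Register Locator DVSEC, so their
	 * component registers are only reachable via the RCRB.)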
	 */
	if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) {
		struct cxl_dport *dport;
		struct cxl_port *port __free(put_cxl_port) =
			cxl_pci_find_port(pdev, &dport);
		if (!port)
			return -EPROBE_DEFER;

		rc = cxl_rcrb_get_comp_regs(pdev, map, dport);
		if (rc)
			return rc;

		rc = cxl_dport_map_rcd_linkcap(pdev, dport);
		if (rc)
			return rc;

	} else if (rc) {
		return rc;
	}

	return cxl_setup_regs(map);
}

static int cxl_pci_ras_unmask(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	void __iomem *addr;
	u32 orig_val, val, mask;
	u16 cap;
	int rc;

	if (!cxlds->regs.ras) {
		dev_dbg(&pdev->dev, "No RAS registers.\n");
		return 0;
	}

	/* BIOS has PCIe AER error control */
	if (!pcie_aer_is_native(pdev))
		return 0;

	rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap);
	if (rc)
		return rc;

	if (cap & PCI_EXP_DEVCTL_URRE) {
		addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);

		mask = CXL_RAS_UNCORRECTABLE_MASK_MASK |
		       CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
		val = orig_val & ~mask;
		writel(val, addr);
		dev_dbg(&pdev->dev,
			"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	if (cap & PCI_EXP_DEVCTL_CERE) {
		addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);
		val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
		writel(val, addr);
		dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	return 0;
}

static void free_event_buf(void *buf)
{
	kvfree(buf);
}

/*
 * There is a single buffer for reading event logs from the mailbox. All logs
 * share this buffer protected by the mds->event_log_lock.
 */
static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_get_event_payload *buf;

	buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	mds->event.buf = buf;

	return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
}

static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
{
	int nvecs;

	/*
	 * Per CXL 3.0 3.1.1 CXL.io Endpoint a function on a CXL device must
	 * not generate INTx messages if that function participates in
	 * CXL.cache or CXL.mem.
	 *
	 * Additionally pci_alloc_irq_vectors() handles calling
	 * pci_free_irq_vectors() automatically despite not being prefixed
	 * pcim_*. See pci_setup_msi_context().
	 */
	nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
				      PCI_IRQ_MSIX | PCI_IRQ_MSI);
	if (nvecs < 1) {
		dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
		return false;
	}
	return true;
}

static irqreturn_t cxl_event_thread(int irq, void *id)
{
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	u32 status;

	do {
		/*
		 * CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
		 * ignore the reserved upper 32 bits
		 */
		status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
		/* Ignore logs unknown to the driver */
		status &= CXLDEV_EVENT_STATUS_ALL;
		if (!status)
			break;
		cxl_mem_get_event_records(mds, status);
		cond_resched();
	} while (status);

	return IRQ_HANDLED;
}

static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	int irq;

	if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
		return -ENXIO;

	irq = pci_irq_vector(pdev,
			     FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
	if (irq < 0)
		return irq;

	return cxl_request_irq(cxlds, irq, cxl_event_thread);
}

static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
				    struct cxl_event_interrupt_policy *policy)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd = {
		.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
		.payload_out = policy,
		.size_out = sizeof(*policy),
	};
	int rc;

	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
	if (rc < 0)
		dev_err(mds->cxlds.dev,
			"Failed to get event interrupt policy: %d", rc);

	return rc;
}

static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
				    struct cxl_event_interrupt_policy *policy)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd;
	int rc;

	*policy = (struct cxl_event_interrupt_policy) {
		.info_settings = CXL_INT_MSI_MSIX,
		.warn_settings = CXL_INT_MSI_MSIX,
		.failure_settings = CXL_INT_MSI_MSIX,
		.fatal_settings = CXL_INT_MSI_MSIX,
	};

	mbox_cmd = (struct cxl_mbox_cmd) {
		.opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
		.payload_in = policy,
		.size_in = sizeof(*policy),
	};

	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
	if (rc < 0) {
		dev_err(mds->cxlds.dev, "Failed to set event interrupt policy: %d",
			rc);
		return rc;
	}

	/* Retrieve final interrupt settings */
	return cxl_event_get_int_policy(mds, policy);
}

static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_event_interrupt_policy policy;
	int rc;

	rc = cxl_event_config_msgnums(mds, &policy);
	if (rc)
		return rc;

	rc = cxl_event_req_irq(cxlds, policy.info_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.warn_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.failure_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.fatal_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
		return rc;
	}

	return 0;
}

static bool cxl_event_int_is_fw(u8 setting)
{
	u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);

	return mode == CXL_INT_FW;
}

static int cxl_event_config(struct pci_host_bridge *host_bridge,
			    struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_event_interrupt_policy policy;
	int rc;

	/*
	 * When BIOS maintains CXL error reporting control, it will process
	 * event records. Only one agent can do so.
	 */
	if (!host_bridge->native_cxl_error)
		return 0;

	if (!irq_avail) {
		dev_info(mds->cxlds.dev, "No interrupt support, disabling event processing.\n");
		return 0;
	}

	rc = cxl_event_get_int_policy(mds, &policy);
	if (rc)
		return rc;

	if (cxl_event_int_is_fw(policy.info_settings) ||
	    cxl_event_int_is_fw(policy.warn_settings) ||
	    cxl_event_int_is_fw(policy.failure_settings) ||
	    cxl_event_int_is_fw(policy.fatal_settings)) {
		dev_err(mds->cxlds.dev,
			"FW still in control of Event Logs despite _OSC settings\n");
		return -EBUSY;
	}

	rc = cxl_mem_alloc_event_buf(mds);
	if (rc)
		return rc;

	rc = cxl_event_irqsetup(mds);
	if (rc)
		return rc;

	cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);

	return 0;
}

static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
{
	int rc;

	/*
	 * Fail the init if there's no mailbox. For a Type 3 device this is
	 * out of spec.
	 */
	if (!cxlds->reg_map.device_map.mbox.valid)
		return -ENODEV;

	rc = cxl_mailbox_init(&cxlds->cxl_mbox, cxlds->dev);
	if (rc)
		return rc;

	return 0;
}

static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size_t width)
{
	struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *root_dev;
	struct cxl_dport *dport;
	struct cxl_port *root __free(put_cxl_port) =
		cxl_mem_find_port(cxlmd, &dport);

	if (!root)
		return -ENXIO;

	root_dev = root->uport_dev;
	if (!root_dev)
		return -ENXIO;

	guard(device)(root_dev);
	if (!root_dev->driver)
		return -ENXIO;

	switch (width) {
	case 2:
		return sysfs_emit(buf, "%#x\n",
				  readw(dport->regs.rcd_pcie_cap + offset));
	case 4:
		return sysfs_emit(buf, "%#x\n",
				  readl(dport->regs.rcd_pcie_cap + offset));
	default:
		return -EINVAL;
	}
}

static ssize_t rcd_link_cap_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCAP, buf, sizeof(u32));
}
static DEVICE_ATTR_RO(rcd_link_cap);

static ssize_t rcd_link_ctrl_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCTL, buf, sizeof(u16));
}
static DEVICE_ATTR_RO(rcd_link_ctrl);

static ssize_t rcd_link_status_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	return rcd_pcie_cap_emit(dev, PCI_EXP_LNKSTA, buf, sizeof(u16));
}
static DEVICE_ATTR_RO(rcd_link_status);

static struct attribute *cxl_rcd_attrs[] = {
	&dev_attr_rcd_link_cap.attr,
	&dev_attr_rcd_link_ctrl.attr,
	&dev_attr_rcd_link_status.attr,
	NULL
};

static umode_t cxl_rcd_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct pci_dev *pdev = to_pci_dev(dev);

	if (is_cxl_restricted(pdev))
		return a->mode;

	return 0;
}

static struct attribute_group cxl_rcd_group = {
	.attrs = cxl_rcd_attrs,
	.is_visible = cxl_rcd_visible,
};
__ATTRIBUTE_GROUPS(cxl_rcd);

static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
	struct cxl_memdev_state *mds;
	struct cxl_dev_state *cxlds;
	struct cxl_register_map map;
	struct cxl_memdev *cxlmd;
	int i, rc, pmu_count;
	bool irq_avail;

	/*
	 * Double check the anonymous union trickery in struct cxl_regs
	 * FIXME switch to struct_group()
	 */
	BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
		     offsetof(struct cxl_regs, device_regs.memdev));

	rc = pcim_enable_device(pdev);
	if (rc)
		return rc;
	pci_set_master(pdev);

	mds = cxl_memdev_state_create(&pdev->dev);
	if (IS_ERR(mds))
		return PTR_ERR(mds);
	cxlds = &mds->cxlds;
	pci_set_drvdata(pdev, cxlds);

	cxlds->rcd = is_cxl_restricted(pdev);
	cxlds->serial = pci_get_dsn(pdev);
	cxlds->cxl_dvsec = pci_find_dvsec_capability(
		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
	if (!cxlds->cxl_dvsec)
		dev_warn(&pdev->dev,
			 "Device DVSEC not present, skip CXL.mem init\n");

	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
	if (rc)
		return rc;

	rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
	if (rc)
		return rc;

	/*
	 * If the component registers can't be found, the cxl_pci driver may
	 * still be useful for management functions, so don't return an error.
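	 * (Mailbox-based management, e.g. firmware update and health info,
	 * does not depend on the component register block.)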
	 */
	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT,
				&cxlds->reg_map);
	if (rc)
		dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
	else if (!cxlds->reg_map.component_map.ras.valid)
		dev_dbg(&pdev->dev, "RAS registers not found\n");

	rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component,
				    BIT(CXL_CM_CAP_CAP_ID_RAS));
	if (rc)
		dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");

	rc = cxl_pci_type3_init_mailbox(cxlds);
	if (rc)
		return rc;

	rc = cxl_await_media_ready(cxlds);
	if (rc == 0)
		cxlds->media_ready = true;
	else
		dev_warn(&pdev->dev, "Media not active (%d)\n", rc);

	irq_avail = cxl_alloc_irq_vectors(pdev);

	rc = cxl_pci_setup_mailbox(mds, irq_avail);
	if (rc)
		return rc;

	rc = cxl_enumerate_cmds(mds);
	if (rc)
		return rc;

	rc = cxl_set_timestamp(mds);
	if (rc)
		return rc;

	rc = cxl_poison_state_init(mds);
	if (rc)
		return rc;

	rc = cxl_dev_state_identify(mds);
	if (rc)
		return rc;

	rc = cxl_mem_create_range_info(mds);
	if (rc)
		return rc;

	cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
	if (IS_ERR(cxlmd))
		return PTR_ERR(cxlmd);

	rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
	if (rc)
		return rc;

	rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
	if (rc)
		return rc;

	pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
	for (i = 0; i < pmu_count; i++) {
		struct cxl_pmu_regs pmu_regs;

		rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
			break;
		}

		rc = cxl_map_pmu_regs(&map, &pmu_regs);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not map PMU regs\n");
			break;
		}

		rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not add PMU instance\n");
			break;
		}
	}

	rc = cxl_event_config(host_bridge, mds, irq_avail);
	if (rc)
		return rc;

	rc = cxl_pci_ras_unmask(pdev);
	if (rc)
		dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");

	pci_save_state(pdev);

	return rc;
}

static const struct pci_device_id cxl_mem_pci_tbl[] = {
	/* PCI class code for CXL.mem Type-3 Devices */
	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
	{ /* terminate list */ },
};
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);

static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;

	dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
		 dev_name(dev));
	pci_restore_state(pdev);
	if (device_attach(dev) <= 0)
		return PCI_ERS_RESULT_DISCONNECT;
	return PCI_ERS_RESULT_RECOVERED;
}

static void cxl_error_resume(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;

	dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
		 dev->driver ? "successful" : "failed");
}

static void cxl_reset_done(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &pdev->dev;

	/*
	 * FLR does not expect to touch the HDM decoders and related
	 * registers. SBR, however, will wipe all device configurations.
	 * Issue a warning if there was an active decoder before the reset
	 * that no longer exists.
	 */
	guard(device)(&cxlmd->dev);
	if (cxlmd->endpoint &&
	    cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
		dev_crit(dev, "SBR happened without memory regions removal.\n");
		dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	}
}

static const struct pci_error_handlers cxl_error_handlers = {
	.error_detected = cxl_error_detected,
	.slot_reset = cxl_slot_reset,
	.resume = cxl_error_resume,
	.cor_error_detected = cxl_cor_error_detected,
	.reset_done = cxl_reset_done,
};

static struct pci_driver cxl_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = cxl_mem_pci_tbl,
	.probe = cxl_pci_probe,
	.err_handler = &cxl_error_handlers,
	.dev_groups = cxl_rcd_groups,
	.driver = {
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
};

#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
static void cxl_handle_cper_event(enum cxl_event_type ev_type,
				  struct cxl_cper_event_rec *rec)
{
	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
	struct pci_dev *pdev __free(pci_dev_put) = NULL;
	enum cxl_event_log_type log_type;
	struct cxl_dev_state *cxlds;
	unsigned int devfn;
	u32 hdr_flags;

	pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
		 device_id->segment_num, device_id->bus_num,
		 device_id->device_num, device_id->func_num);

	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
					   device_id->bus_num, devfn);
	if (!pdev)
		return;

	guard(device)(&pdev->dev);
	if (pdev->driver != &cxl_pci_driver)
		return;

	cxlds = pci_get_drvdata(pdev);
	if (!cxlds)
		return;

	/* Fabricate a log type */
	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);

	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
			       &uuid_null, &rec->event);
}

static void cxl_cper_work_fn(struct work_struct *work)
{
	struct cxl_cper_work_data wd;

	while (cxl_cper_kfifo_get(&wd))
		cxl_handle_cper_event(wd.event_type, &wd.rec);
}
static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);

static int __init cxl_pci_driver_init(void)
{
	int rc;

	rc = pci_register_driver(&cxl_pci_driver);
	if (rc)
		return rc;

	rc = cxl_cper_register_work(&cxl_cper_work);
	if (rc)
		pci_unregister_driver(&cxl_pci_driver);

	return rc;
}

static void __exit cxl_pci_driver_exit(void)
{
	cxl_cper_unregister_work(&cxl_cper_work);
	cancel_work_sync(&cxl_cper_work);
	pci_unregister_driver(&cxl_pci_driver);
}

module_init(cxl_pci_driver_init);
module_exit(cxl_pci_driver_exit);
MODULE_DESCRIPTION("CXL: PCI manageability");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS("CXL");