// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
#include <linux/unaligned.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/moduleparam.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/io.h>
#include <cxl/mailbox.h>
#include "cxlmem.h"
#include "cxlpci.h"
#include "cxl.h"
#include "pmu.h"

/**
 * DOC: cxl pci
 *
 * This implements the PCI exclusive functionality for a CXL device as it is
 * defined by the Compute Express Link specification. CXL devices may surface
 * certain functionality even if it isn't CXL enabled. While this driver is
 * focused around the PCI specific aspects of a CXL device, it binds to the
 * specific CXL memory device class code, and therefore the implementation of
 * cxl_pci is focused around CXL memory devices.
 *
 * The driver has several responsibilities, mainly:
 *  - Create the memX device and register on the CXL bus.
 *  - Enumerate device's register interface and map them.
 *  - Register an nvdimm bridge device with cxl_core.
 *  - Register a CXL mailbox with cxl_core.
 */

#define cxl_doorbell_busy(cxlds)                                                \
	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &                 \
	 CXLDEV_MBOX_CTRL_DOORBELL)

/* CXL 2.0 - 8.2.8.4 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)

/*
 * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
 * dictate how long to wait for the mailbox to become ready. The new
 * field allows the device to tell software the amount of time to wait
 * before mailbox ready. This field per the spec theoretically allows
 * for up to 255 seconds. 255 seconds is unreasonably long, it's longer
 * than the maximum SATA port link recovery wait. Default to 60 seconds
 * until someone builds a CXL device that needs more time in practice.
 */
static unsigned short mbox_ready_timeout = 60;
module_param(mbox_ready_timeout, ushort, 0644);
MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");

static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
	const unsigned long start = jiffies;
	unsigned long end = start;

	while (cxl_doorbell_busy(cxlds)) {
		end = jiffies;

		if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
			/* Check again in case preempted before timeout test */
			if (!cxl_doorbell_busy(cxlds))
				break;
			return -ETIMEDOUT;
		}
		cpu_relax();
	}

	dev_dbg(cxlds->dev, "Doorbell wait took %dms",
		jiffies_to_msecs(end) - jiffies_to_msecs(start));
	return 0;
}

#define cxl_err(dev, status, msg)                                          \
	dev_err_ratelimited(dev, msg ", device state %s%s\n",             \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",    \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

#define cxl_cmd_err(dev, cmd, status, msg)                                 \
	dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
			    (cmd)->opcode,                                 \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",    \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

/*
 * Threaded irq dev_id's must be globally unique. cxl_dev_id provides a unique
 * wrapper object for each irq within the same cxlds.
 */
struct cxl_dev_id {
	struct cxl_dev_state *cxlds;
};

static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
			   irq_handler_t thread_fn)
{
	struct device *dev = cxlds->dev;
	struct cxl_dev_id *dev_id;

	dev_id = devm_kzalloc(dev, sizeof(*dev_id), GFP_KERNEL);
	if (!dev_id)
		return -ENOMEM;
	dev_id->cxlds = cxlds;

	return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
					 IRQF_SHARED | IRQF_ONESHOT, NULL,
					 dev_id);
}

static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
{
	u64 reg;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
}

static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
{
	u64 reg;
	u16 opcode;
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);

	if (!cxl_mbox_background_complete(cxlds))
		return IRQ_NONE;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
	if (opcode == CXL_MBOX_OP_SANITIZE) {
		mutex_lock(&cxl_mbox->mbox_mutex);
		if (mds->security.sanitize_node)
			mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
		mutex_unlock(&cxl_mbox->mbox_mutex);
	} else {
		/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
		rcuwait_wake_up(&cxl_mbox->mbox_wait);
	}

	return IRQ_HANDLED;
}

/*
 * Sanitization operation polling mode.
 */
static void cxl_mbox_sanitize_work(struct work_struct *work)
{
	struct cxl_memdev_state *mds =
		container_of(work, typeof(*mds), security.poll_dwork.work);
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;

	mutex_lock(&cxl_mbox->mbox_mutex);
	if (cxl_mbox_background_complete(cxlds)) {
		mds->security.poll_tmo_secs = 0;
		if (mds->security.sanitize_node)
			sysfs_notify_dirent(mds->security.sanitize_node);
		mds->security.sanitize_active = false;

		dev_dbg(cxlds->dev, "Sanitization operation ended\n");
	} else {
		int timeout = mds->security.poll_tmo_secs + 10;

		mds->security.poll_tmo_secs = min(15 * 60, timeout);
		schedule_delayed_work(&mds->security.poll_dwork, timeout * HZ);
	}
	mutex_unlock(&cxl_mbox->mbox_mutex);
}

/**
 * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
 * @cxl_mbox: CXL mailbox context
 * @mbox_cmd: Command to send to the memory device.
 *
 * Context: Any context. Expects mbox_mutex to be held.
 * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
 *         Caller should check the return code in @mbox_cmd to make sure it
 *         succeeded.
 *
 * This is a generic form of the CXL mailbox send command thus only using the
 * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
 * devices, and perhaps other types of CXL devices may have further information
 * available upon error conditions. Driver facilities wishing to send mailbox
 * commands should use the wrapper command.
 *
 * The CXL spec allows for up to two mailboxes. The intention is for the primary
 * mailbox to be OS controlled and the secondary mailbox to be used by system
 * firmware. This allows the OS and firmware to communicate with the device and
 * not need to coordinate with each other. The driver only uses the primary
 * mailbox.
 */
static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
				   struct cxl_mbox_cmd *mbox_cmd)
{
	struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
	struct device *dev = cxlds->dev;
	u64 cmd_reg, status_reg;
	size_t out_len;
	int rc;

	lockdep_assert_held(&cxl_mbox->mbox_mutex);

	/*
	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
	 *   1. Caller reads MB Control Register to verify doorbell is clear
	 *   2. Caller writes Command Register
	 *   3. Caller writes Command Payload Registers if input payload is non-empty
	 *   4. Caller writes MB Control Register to set doorbell
	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
	 *   6. Caller reads MB Status Register to fetch Return code
	 *   7. If command successful, Caller reads Command Register to get Payload Length
	 *   8. If output payload is non-empty, host reads Command Payload Registers
	 *
	 * Hardware is free to do whatever it wants before the doorbell is rung,
	 * and isn't allowed to change anything after it clears the doorbell. As
	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
	 * also happen in any order (though some orders might not make sense).
	 */

	/* #1 */
	if (cxl_doorbell_busy(cxlds)) {
		u64 md_status =
			readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
			    "mailbox queue busy");
		return -EBUSY;
	}

	/*
	 * With sanitize polling, hardware might be done and the poller still
	 * not be in sync. Ensure no new command comes in until so. Keep the
	 * hardware semantics and only allow device health status.
	 */
	if (mds->security.poll_tmo_secs > 0) {
		if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
			return -EBUSY;
	}

	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
			     mbox_cmd->opcode);
	if (mbox_cmd->size_in) {
		if (WARN_ON(!mbox_cmd->payload_in))
			return -EINVAL;

		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
				      mbox_cmd->size_in);
		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
	}

	/* #2, #3 */
	writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);

	/* #4 */
	dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
	writel(CXLDEV_MBOX_CTRL_DOORBELL,
	       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	/* #5 */
	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
	if (rc == -ETIMEDOUT) {
		u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
		return rc;
	}

	/* #6 */
	status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
	mbox_cmd->return_code =
		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);

	/*
	 * Handle the background command in a synchronous manner.
	 *
	 * All other mailbox commands will serialize/queue on the mbox_mutex,
	 * which we currently hold. Furthermore this also guarantees that
	 * cxl_mbox_background_complete() checks are safe amongst each other,
	 * in that no new bg operation can occur in between.
	 *
	 * Background operations are timesliced in accordance with the nature
	 * of the command. In the event of timeout, the mailbox state is
	 * indeterminate until the next successful command submission and the
	 * driver can get back in sync with the hardware state.
	 */
	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
		u64 bg_status_reg;
		int i, timeout;

		/*
		 * Sanitization is a special case which monopolizes the device
		 * and cannot be timesliced. Handle asynchronously instead,
		 * and allow userspace to poll(2) for completion.
		 */
		if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
			if (mds->security.sanitize_active)
				return -EBUSY;

			/* give first timeout a second */
			timeout = 1;
			mds->security.poll_tmo_secs = timeout;
			mds->security.sanitize_active = true;
			schedule_delayed_work(&mds->security.poll_dwork,
					      timeout * HZ);
			dev_dbg(dev, "Sanitization operation started\n");
			goto success;
		}

		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
			mbox_cmd->opcode);

		timeout = mbox_cmd->poll_interval_ms;
		for (i = 0; i < mbox_cmd->poll_count; i++) {
			if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
				       cxl_mbox_background_complete(cxlds),
				       TASK_UNINTERRUPTIBLE,
				       msecs_to_jiffies(timeout)) > 0)
				break;
		}

		if (!cxl_mbox_background_complete(cxlds)) {
			dev_err(dev, "timeout waiting for background (%d ms)\n",
				timeout * mbox_cmd->poll_count);
			return -ETIMEDOUT;
		}

		bg_status_reg = readq(cxlds->regs.mbox +
				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
		mbox_cmd->return_code =
			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
				  bg_status_reg);
		dev_dbg(dev,
			"Mailbox background operation (0x%04x) completed\n",
			mbox_cmd->opcode);
	}

	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
		dev_dbg(dev, "Mailbox operation had an error: %s\n",
			cxl_mbox_cmd_rc2str(mbox_cmd));
		return 0; /* completed but caller must check return_code */
	}

success:
	/* #7 */
	cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);

	/* #8 */
	if (out_len && mbox_cmd->payload_out) {
		/*
		 * Sanitize the copy. If hardware misbehaves, out_len per the
		 * spec can actually be greater than the max allowed size (21
		 * bits available but spec defined 1M max). The caller also may
		 * have requested less data than the hardware supplied even
		 * within spec.
		 */
		size_t n;

		n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
		memcpy_fromio(mbox_cmd->payload_out, payload, n);
		mbox_cmd->size_out = n;
	} else {
		mbox_cmd->size_out = 0;
	}

	return 0;
}

static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
			     struct cxl_mbox_cmd *cmd)
{
	int rc;

	mutex_lock_io(&cxl_mbox->mbox_mutex);
	rc = __cxl_pci_mbox_send_cmd(cxl_mbox, cmd);
	mutex_unlock(&cxl_mbox->mbox_mutex);

	return rc;
}

static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
	struct device *dev = cxlds->dev;
	unsigned long timeout;
	int irq, msgnum;
	u64 md_status;
	u32 ctrl;

	timeout = jiffies + mbox_ready_timeout * HZ;
	do {
		md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
		if (md_status & CXLMDEV_MBOX_IF_READY)
			break;
		if (msleep_interruptible(100))
			break;
	} while (!time_after(jiffies, timeout));

	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
		cxl_err(dev, md_status, "timeout awaiting mailbox ready");
		return -ETIMEDOUT;
	}

	/*
	 * A command may be in flight from a previous driver instance,
	 * think kexec, do one doorbell wait so that
	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
	 * source for future doorbell busy events.
	 */
	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
		cxl_err(dev, md_status, "timeout awaiting mailbox idle");
		return -ETIMEDOUT;
	}

	cxl_mbox->mbox_send = cxl_pci_mbox_send;
	cxl_mbox->payload_size =
		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);

	/*
	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	 *
	 * If the size is too small, mandatory commands will not work and so
	 * there's no point in going forward. If the size is too large, there's
	 * no harm in soft limiting it.
	 */
	cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
	if (cxl_mbox->payload_size < 256) {
		dev_err(dev, "Mailbox is too small (%zub)",
			cxl_mbox->payload_size);
		return -ENXIO;
	}

	dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);

	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);

	/* background command interrupts are optional */
	if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) || !irq_avail)
		return 0;

	msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
	irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
	if (irq < 0)
		return 0;

	if (cxl_request_irq(cxlds, irq, cxl_pci_mbox_irq))
		return 0;

	dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
	/* enable background command mbox irq support */
	ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
	ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
	writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	return 0;
}

/*
 * Assume that any RCIEP that emits the CXL memory expander class code
 * is an RCD
 */
static bool is_cxl_restricted(struct pci_dev *pdev)
{
	return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
}

static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
				  struct cxl_register_map *map)
{
	struct cxl_dport *dport;
	resource_size_t component_reg_phys;

	*map = (struct cxl_register_map) {
		.host = &pdev->dev,
		.resource = CXL_RESOURCE_NONE,
	};

	struct cxl_port *port __free(put_cxl_port) =
		cxl_pci_find_port(pdev, &dport);
	if (!port)
		return -EPROBE_DEFER;

	component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
	if (component_reg_phys == CXL_RESOURCE_NONE)
		return -ENXIO;

	map->resource = component_reg_phys;
	map->reg_type = CXL_REGLOC_RBI_COMPONENT;
	map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;

	return 0;
}

static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
			      struct cxl_register_map *map)
{
	int rc;

	rc = cxl_find_regblock(pdev, type, map);

	/*
	 * If the Register Locator DVSEC does not exist, check if it
	 * is an RCH and try to extract the Component Registers from
	 * an RCRB.
	 */
	if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev))
		rc = cxl_rcrb_get_comp_regs(pdev, map);

	if (rc)
		return rc;

	return cxl_setup_regs(map);
}

static int cxl_pci_ras_unmask(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	void __iomem *addr;
	u32 orig_val, val, mask;
	u16 cap;
	int rc;

	if (!cxlds->regs.ras) {
		dev_dbg(&pdev->dev, "No RAS registers.\n");
		return 0;
	}

	/* BIOS has PCIe AER error control */
	if (!pcie_aer_is_native(pdev))
		return 0;

	rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap);
	if (rc)
		return rc;

	if (cap & PCI_EXP_DEVCTL_URRE) {
		addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);

		mask = CXL_RAS_UNCORRECTABLE_MASK_MASK |
		       CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
		val = orig_val & ~mask;
		writel(val, addr);
		dev_dbg(&pdev->dev,
			"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	if (cap & PCI_EXP_DEVCTL_CERE) {
		addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);
		val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
		writel(val, addr);
		dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	return 0;
}

static void free_event_buf(void *buf)
{
	kvfree(buf);
}

/*
 * There is a single buffer for reading event logs from the mailbox. All logs
 * share this buffer protected by the mds->event_log_lock.
 */
static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_get_event_payload *buf;

	buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	mds->event.buf = buf;

	return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
}

static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
{
	int nvecs;

	/*
	 * Per CXL 3.0 3.1.1 CXL.io Endpoint a function on a CXL device must
	 * not generate INTx messages if that function participates in
	 * CXL.cache or CXL.mem.
	 *
	 * Additionally pci_alloc_irq_vectors() handles calling
	 * pci_free_irq_vectors() automatically despite not being called
	 * pcim_*. See pci_setup_msi_context().
	 */
	nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
				      PCI_IRQ_MSIX | PCI_IRQ_MSI);
	if (nvecs < 1) {
		dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
		return false;
	}
	return true;
}

static irqreturn_t cxl_event_thread(int irq, void *id)
{
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	u32 status;

	do {
		/*
		 * CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
		 * ignore the reserved upper 32 bits
		 */
		status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
		/* Ignore logs unknown to the driver */
		status &= CXLDEV_EVENT_STATUS_ALL;
		if (!status)
			break;
		cxl_mem_get_event_records(mds, status);
		cond_resched();
	} while (status);

	return IRQ_HANDLED;
}

static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	int irq;

	if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
		return -ENXIO;

	irq = pci_irq_vector(pdev,
			     FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
	if (irq < 0)
		return irq;

	return cxl_request_irq(cxlds, irq, cxl_event_thread);
}

static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
				    struct cxl_event_interrupt_policy *policy)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd = {
		.opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
		.payload_out = policy,
		.size_out = sizeof(*policy),
	};
	int rc;

	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
	if (rc < 0)
		dev_err(mds->cxlds.dev,
			"Failed to get event interrupt policy : %d", rc);

	return rc;
}

static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
				    struct cxl_event_interrupt_policy *policy)
{
	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
	struct cxl_mbox_cmd mbox_cmd;
	int rc;

	*policy = (struct cxl_event_interrupt_policy) {
		.info_settings = CXL_INT_MSI_MSIX,
		.warn_settings = CXL_INT_MSI_MSIX,
		.failure_settings = CXL_INT_MSI_MSIX,
		.fatal_settings = CXL_INT_MSI_MSIX,
	};

	mbox_cmd = (struct cxl_mbox_cmd) {
		.opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
		.payload_in = policy,
		.size_in = sizeof(*policy),
	};

	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
	if (rc < 0) {
		dev_err(mds->cxlds.dev, "Failed to set event interrupt policy : %d",
			rc);
		return rc;
	}

	/* Retrieve final interrupt settings */
	return cxl_event_get_int_policy(mds, policy);
}

static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_event_interrupt_policy policy;
	int rc;

	rc = cxl_event_config_msgnums(mds, &policy);
	if (rc)
		return rc;

	rc = cxl_event_req_irq(cxlds, policy.info_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.warn_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.failure_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
		return rc;
	}

	rc = cxl_event_req_irq(cxlds, policy.fatal_settings);
	if (rc) {
		dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
		return rc;
	}

	return 0;
}

static bool cxl_event_int_is_fw(u8 setting)
{
	u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);

	return mode == CXL_INT_FW;
}

static int cxl_event_config(struct pci_host_bridge *host_bridge,
			    struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_event_interrupt_policy policy;
	int rc;

	/*
	 * When BIOS maintains CXL error reporting control, it will process
	 * event records. Only one agent can do so.
	 */
	if (!host_bridge->native_cxl_error)
		return 0;

	if (!irq_avail) {
		dev_info(mds->cxlds.dev, "No interrupt support, disable event processing.\n");
		return 0;
	}

	rc = cxl_mem_alloc_event_buf(mds);
	if (rc)
		return rc;

	rc = cxl_event_get_int_policy(mds, &policy);
	if (rc)
		return rc;

	if (cxl_event_int_is_fw(policy.info_settings) ||
	    cxl_event_int_is_fw(policy.warn_settings) ||
	    cxl_event_int_is_fw(policy.failure_settings) ||
	    cxl_event_int_is_fw(policy.fatal_settings)) {
		dev_err(mds->cxlds.dev,
			"FW still in control of Event Logs despite _OSC settings\n");
		return -EBUSY;
	}

	rc = cxl_event_irqsetup(mds);
	if (rc)
		return rc;

	cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);

	return 0;
}

static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
{
	int rc;

	/*
	 * Fail the init if there's no mailbox. For a type3 this is out of spec.
	 */
	if (!cxlds->reg_map.device_map.mbox.valid)
		return -ENODEV;

	rc = cxl_mailbox_init(&cxlds->cxl_mbox, cxlds->dev);
	if (rc)
		return rc;

	return 0;
}

static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
	struct cxl_memdev_state *mds;
	struct cxl_dev_state *cxlds;
	struct cxl_register_map map;
	struct cxl_memdev *cxlmd;
	int i, rc, pmu_count;
	bool irq_avail;

	/*
	 * Double check the anonymous union trickery in struct cxl_regs
	 * FIXME switch to struct_group()
	 */
	BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
		     offsetof(struct cxl_regs, device_regs.memdev));

	rc = pcim_enable_device(pdev);
	if (rc)
		return rc;
	pci_set_master(pdev);

	mds = cxl_memdev_state_create(&pdev->dev);
	if (IS_ERR(mds))
		return PTR_ERR(mds);
	cxlds = &mds->cxlds;
	pci_set_drvdata(pdev, cxlds);

	cxlds->rcd = is_cxl_restricted(pdev);
	cxlds->serial = pci_get_dsn(pdev);
	cxlds->cxl_dvsec = pci_find_dvsec_capability(
		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
	if (!cxlds->cxl_dvsec)
		dev_warn(&pdev->dev,
			 "Device DVSEC not present, skip CXL.mem init\n");

	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
	if (rc)
		return rc;

	rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
	if (rc)
		return rc;

	/*
	 * If the component registers can't be found, the cxl_pci driver may
	 * still be useful for management functions so don't return an error.
	 */
	rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT,
				&cxlds->reg_map);
	if (rc)
		dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
	else if (!cxlds->reg_map.component_map.ras.valid)
		dev_dbg(&pdev->dev, "RAS registers not found\n");

	rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component,
				    BIT(CXL_CM_CAP_CAP_ID_RAS));
	if (rc)
		dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");

	rc = cxl_pci_type3_init_mailbox(cxlds);
	if (rc)
		return rc;

	rc = cxl_await_media_ready(cxlds);
	if (rc == 0)
		cxlds->media_ready = true;
	else
		dev_warn(&pdev->dev, "Media not active (%d)\n", rc);

	irq_avail = cxl_alloc_irq_vectors(pdev);

	rc = cxl_pci_setup_mailbox(mds, irq_avail);
	if (rc)
		return rc;

	rc = cxl_enumerate_cmds(mds);
	if (rc)
		return rc;

	rc = cxl_set_timestamp(mds);
	if (rc)
		return rc;

	rc = cxl_poison_state_init(mds);
	if (rc)
		return rc;

	rc = cxl_dev_state_identify(mds);
	if (rc)
		return rc;

	rc = cxl_mem_create_range_info(mds);
	if (rc)
		return rc;

	cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
	if (IS_ERR(cxlmd))
		return PTR_ERR(cxlmd);

	rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
	if (rc)
		return rc;

	rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
	if (rc)
		return rc;

	pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
	for (i = 0; i < pmu_count; i++) {
		struct cxl_pmu_regs pmu_regs;

		rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
			break;
		}

		rc = cxl_map_pmu_regs(&map, &pmu_regs);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not map PMU regs\n");
			break;
		}

		rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
		if (rc) {
			dev_dbg(&pdev->dev, "Could not add PMU instance\n");
			break;
		}
	}

	rc = cxl_event_config(host_bridge, mds, irq_avail);
	if (rc)
		return rc;

	rc = cxl_pci_ras_unmask(pdev);
	if (rc)
		dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");

	pci_save_state(pdev);

	return rc;
}

static const struct pci_device_id cxl_mem_pci_tbl[] = {
	/* PCI class code for CXL.mem Type-3 Devices */
	{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
	{ /* terminate list */ },
};
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);

static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;

	dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
		 dev_name(dev));
	pci_restore_state(pdev);
	if (device_attach(dev) <= 0)
		return PCI_ERS_RESULT_DISCONNECT;
	return PCI_ERS_RESULT_RECOVERED;
}

static void cxl_error_resume(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &cxlmd->dev;

	dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
		 dev->driver ? "successful" : "failed");
}

static void cxl_reset_done(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	struct cxl_memdev *cxlmd = cxlds->cxlmd;
	struct device *dev = &pdev->dev;

	/*
	 * FLR does not expect to touch the HDM decoders and related
	 * registers. SBR, however, will wipe all device configurations.
	 * Issue a warning if there was an active decoder before the reset
	 * that no longer exists.
	 */
	guard(device)(&cxlmd->dev);
	if (cxlmd->endpoint &&
	    cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
		dev_crit(dev, "SBR happened without memory regions removal.\n");
		dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	}
}

static const struct pci_error_handlers cxl_error_handlers = {
	.error_detected		= cxl_error_detected,
	.slot_reset		= cxl_slot_reset,
	.resume			= cxl_error_resume,
	.cor_error_detected	= cxl_cor_error_detected,
	.reset_done		= cxl_reset_done,
};

static struct pci_driver cxl_pci_driver = {
	.name			= KBUILD_MODNAME,
	.id_table		= cxl_mem_pci_tbl,
	.probe			= cxl_pci_probe,
	.err_handler		= &cxl_error_handlers,
	.driver	= {
		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
	},
};

#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
static void cxl_handle_cper_event(enum cxl_event_type ev_type,
				  struct cxl_cper_event_rec *rec)
{
	struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
	struct pci_dev *pdev __free(pci_dev_put) = NULL;
	enum cxl_event_log_type log_type;
	struct cxl_dev_state *cxlds;
	unsigned int devfn;
	u32 hdr_flags;

	pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
		 device_id->segment_num, device_id->bus_num,
		 device_id->device_num, device_id->func_num);

	devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
	pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
					   device_id->bus_num, devfn);
	if (!pdev)
		return;

	guard(device)(&pdev->dev);
	if (pdev->driver != &cxl_pci_driver)
		return;

	cxlds = pci_get_drvdata(pdev);
	if (!cxlds)
		return;

	/* Fabricate a log type */
	hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
	log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);

	cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
			       &uuid_null, &rec->event);
}

static void cxl_cper_work_fn(struct work_struct *work)
{
	struct cxl_cper_work_data wd;

	while (cxl_cper_kfifo_get(&wd))
		cxl_handle_cper_event(wd.event_type, &wd.rec);
}
static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);

static int __init cxl_pci_driver_init(void)
{
	int rc;

	rc = pci_register_driver(&cxl_pci_driver);
	if (rc)
		return rc;

	rc = cxl_cper_register_work(&cxl_cper_work);
	if (rc)
		pci_unregister_driver(&cxl_pci_driver);

	return rc;
}

static void __exit cxl_pci_driver_exit(void)
{
	cxl_cper_unregister_work(&cxl_cper_work);
	cancel_work_sync(&cxl_cper_work);
	pci_unregister_driver(&cxl_pci_driver);
}

module_init(cxl_pci_driver_init);
module_exit(cxl_pci_driver_exit);
MODULE_DESCRIPTION("CXL: PCI manageability");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);