1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include <linux/bitfield.h> 7 #include <linux/sizes.h> 8 #include <linux/time64.h> 9 #include <linux/vfio_pci_core.h> 10 #include <linux/delay.h> 11 #include <linux/jiffies.h> 12 #include <linux/sched.h> 13 #include <linux/pci-p2pdma.h> 14 #include <linux/pm_runtime.h> 15 #include <linux/memory-failure.h> 16 17 /* 18 * The device memory usable to the workloads running in the VM is cached 19 * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) 20 * to the VM and is represented as usemem. 21 * Moreover, the VM GPU device driver needs a non-cacheable region to 22 * support the MIG feature. This region is also exposed as a 64b BAR 23 * (comprising of BAR2 and BAR3 region) and represented as resmem. 24 */ 25 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX 26 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX 27 28 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ 29 #define MEMBLK_SIZE SZ_512M 30 31 #define DVSEC_BITMAP_OFFSET 0xA 32 #define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) 33 34 #define GPU_CAP_DVSEC_REGISTER 3 35 36 #define C2C_LINK_BAR0_OFFSET 0x1498 37 #define HBM_TRAINING_BAR0_OFFSET 0x200BC 38 #define STATUS_READY 0xFF 39 40 #define POLL_QUANTUM_MS 1000 41 #define POLL_TIMEOUT_MS (30 * 1000) 42 43 /* 44 * The state of the two device memory region - resmem and usemem - is 45 * saved as struct mem_region. 46 */ 47 struct mem_region { 48 phys_addr_t memphys; /* Base physical address of the region */ 49 size_t memlength; /* Region size */ 50 size_t bar_size; /* Reported region BAR size */ 51 __le64 bar_val; /* Emulated BAR offset registers */ 52 union { 53 void *memaddr; 54 void __iomem *ioaddr; 55 }; /* Base virtual address of the region */ 56 struct pfn_address_space pfn_address_space; 57 }; 58 59 struct nvgrace_gpu_pci_core_device { 60 struct vfio_pci_core_device core_device; 61 /* Cached and usable memory for the VM. */ 62 struct mem_region usemem; 63 /* Non cached memory carved out from the end of device memory */ 64 struct mem_region resmem; 65 /* Lock to control device memory kernel mapping */ 66 struct mutex remap_lock; 67 void __iomem *bar0_base; 68 bool has_mig_hw_bug; 69 /* GPU has just been reset */ 70 bool reset_done; 71 /* CXL Device DVSEC offset; 0 if not present (legacy GB path) */ 72 int cxl_dvsec; 73 }; 74 75 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) 76 { 77 struct nvgrace_gpu_pci_core_device *nvdev = 78 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 79 core_device.vdev); 80 81 nvdev->resmem.bar_val = 0; 82 nvdev->usemem.bar_val = 0; 83 } 84 85 /* Choose the structure corresponding to the fake BAR with a given index. */ 86 static struct mem_region * 87 nvgrace_gpu_memregion(int index, 88 struct nvgrace_gpu_pci_core_device *nvdev) 89 { 90 if (index == USEMEM_REGION_INDEX) 91 return &nvdev->usemem; 92 93 if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) 94 return &nvdev->resmem; 95 96 return NULL; 97 } 98 99 static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev, 100 unsigned int index, 101 unsigned long pfn, 102 pgoff_t *pfn_offset_in_region) 103 { 104 struct mem_region *region; 105 unsigned long start_pfn, num_pages; 106 107 region = nvgrace_gpu_memregion(index, nvdev); 108 if (!region) 109 return -EINVAL; 110 111 start_pfn = PHYS_PFN(region->memphys); 112 num_pages = region->memlength >> PAGE_SHIFT; 113 114 if (pfn < start_pfn || pfn >= start_pfn + num_pages) 115 return -EFAULT; 116 117 *pfn_offset_in_region = pfn - start_pfn; 118 119 return 0; 120 } 121 122 static inline 123 struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma); 124 125 static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma, 126 unsigned long pfn, 127 pgoff_t *pgoff) 128 { 129 struct nvgrace_gpu_pci_core_device *nvdev; 130 unsigned int index = 131 vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 132 pgoff_t vma_offset_in_region = vma->vm_pgoff & 133 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 134 pgoff_t pfn_offset_in_region; 135 int ret; 136 137 nvdev = vma_to_nvdev(vma); 138 if (!nvdev) 139 return -ENOENT; 140 141 ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region); 142 if (ret) 143 return ret; 144 145 /* Ensure PFN is not before VMA's start within the region */ 146 if (pfn_offset_in_region < vma_offset_in_region) 147 return -EFAULT; 148 149 /* Calculate offset from VMA start */ 150 *pgoff = vma->vm_pgoff + 151 (pfn_offset_in_region - vma_offset_in_region); 152 153 return 0; 154 } 155 156 static int 157 nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev, 158 struct mem_region *region) 159 { 160 unsigned long pfn, nr_pages; 161 162 pfn = PHYS_PFN(region->memphys); 163 nr_pages = region->memlength >> PAGE_SHIFT; 164 165 region->pfn_address_space.node.start = pfn; 166 region->pfn_address_space.node.last = pfn + nr_pages - 1; 167 region->pfn_address_space.mapping = core_vdev->inode->i_mapping; 168 region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff; 169 170 return register_pfn_address_space(®ion->pfn_address_space); 171 } 172 173 static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) 174 { 175 struct vfio_pci_core_device *vdev = 176 container_of(core_vdev, struct vfio_pci_core_device, vdev); 177 struct nvgrace_gpu_pci_core_device *nvdev = 178 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 179 core_device.vdev); 180 void __iomem *io; 181 int ret; 182 183 ret = vfio_pci_core_enable(vdev); 184 if (ret) 185 return ret; 186 187 if (nvdev->usemem.memlength) { 188 nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 189 mutex_init(&nvdev->remap_lock); 190 } 191 192 /* 193 * GPU readiness is checked by reading the BAR0 registers. 194 * The BAR map was just set up by vfio_pci_core_enable(), so 195 * bail early if that wasn't successful: 196 */ 197 io = vfio_pci_core_get_iomap(vdev, 0); 198 if (IS_ERR(io)) { 199 ret = PTR_ERR(io); 200 goto error_exit; 201 } 202 203 if (nvdev->resmem.memlength) { 204 ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem); 205 if (ret && ret != -EOPNOTSUPP) 206 goto error_exit; 207 } 208 209 ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem); 210 if (ret && ret != -EOPNOTSUPP) 211 goto register_mem_failed; 212 213 vfio_pci_core_finish_enable(vdev); 214 nvdev->bar0_base = io; 215 216 return 0; 217 218 register_mem_failed: 219 if (nvdev->resmem.memlength) 220 unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); 221 error_exit: 222 vfio_pci_core_disable(vdev); 223 return ret; 224 } 225 226 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) 227 { 228 struct nvgrace_gpu_pci_core_device *nvdev = 229 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 230 core_device.vdev); 231 232 nvdev->bar0_base = NULL; 233 234 if (nvdev->resmem.memlength) 235 unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); 236 237 unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); 238 239 /* Unmap the mapping to the device memory cached region */ 240 if (nvdev->usemem.memaddr) { 241 memunmap(nvdev->usemem.memaddr); 242 nvdev->usemem.memaddr = NULL; 243 } 244 245 /* Unmap the mapping to the device memory non-cached region */ 246 if (nvdev->resmem.ioaddr) { 247 iounmap(nvdev->resmem.ioaddr); 248 nvdev->resmem.ioaddr = NULL; 249 } 250 251 mutex_destroy(&nvdev->remap_lock); 252 253 vfio_pci_core_close_device(core_vdev); 254 } 255 256 static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io) 257 { 258 unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); 259 260 do { 261 if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && 262 (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) 263 return 0; 264 if (schedule_timeout_killable(msecs_to_jiffies(POLL_QUANTUM_MS))) 265 return -EINTR; 266 } while (!time_after(jiffies, timeout)); 267 268 return -ETIME; 269 } 270 271 /* 272 * Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low 273 * (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2: 274 * 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s, 275 * 101b-111b = reserved (clamped to 256s). 276 */ 277 static inline unsigned long cxl_mem_active_timeout_ms(u8 timeout) 278 { 279 return MSEC_PER_SEC << (2 * min_t(u8, timeout, 4)); 280 } 281 282 /* 283 * Check if CXL DVSEC reports memory as valid and active. 284 */ 285 static inline bool cxl_dvsec_mem_is_active(u32 status) 286 { 287 return (status & PCI_DVSEC_CXL_MEM_INFO_VALID) && 288 (status & PCI_DVSEC_CXL_MEM_ACTIVE); 289 } 290 291 static int nvgrace_gpu_test_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev, 292 u32 *status) 293 { 294 struct pci_dev *pdev = nvdev->core_device.pdev; 295 int cxl_dvsec = nvdev->cxl_dvsec; 296 u32 val; 297 298 pci_read_config_dword(pdev, 299 cxl_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0), 300 &val); 301 302 if (val == ~0U) 303 return -ENODEV; 304 305 if (status) 306 *status = val; 307 308 if (cxl_dvsec_mem_is_active(val)) 309 return 0; 310 311 return -EAGAIN; 312 } 313 314 /* 315 * As per CXL spec r4.0 sec 8.1.3.8.2, MEM_INFO_VALID needs to be set 316 * within 1s and MEM_ACTIVE within Memory_Active_Timeout (up to ~256s) 317 * after reset and bootup. 318 */ 319 static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev) 320 { 321 unsigned long deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS); 322 bool active_phase = false; 323 u32 status; 324 int ret; 325 326 for (;;) { 327 ret = nvgrace_gpu_test_device_ready_cxl(nvdev, &status); 328 if (ret != -EAGAIN) 329 return ret; 330 331 if (!active_phase && (status & PCI_DVSEC_CXL_MEM_INFO_VALID)) { 332 u8 t = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT, status); 333 334 deadline = jiffies + 335 msecs_to_jiffies(cxl_mem_active_timeout_ms(t)); 336 active_phase = true; 337 } 338 339 if (time_after(jiffies, deadline)) 340 return -ETIME; 341 342 if (schedule_timeout_killable(msecs_to_jiffies(POLL_QUANTUM_MS))) 343 return -EINTR; 344 } 345 } 346 347 /* 348 * If the GPU memory is accessed by the CPU while the GPU is not ready 349 * after reset, it can cause harmless corrected RAS events to be logged. 350 * Make sure the GPU is ready before establishing the mappings. 351 * 352 * Since the CXL polling wait could take 256s, it happens outside 353 * memory_lock. Only do quick readiness check under the lock. Legacy 354 * keeps the in-lock poll. 355 */ 356 static int 357 nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) 358 { 359 struct vfio_pci_core_device *vdev = &nvdev->core_device; 360 int ret; 361 362 lockdep_assert_held_read(&vdev->memory_lock); 363 364 if (!nvdev->reset_done) 365 return 0; 366 367 if (!__vfio_pci_memory_enabled(vdev)) 368 return -EIO; 369 370 if (nvdev->cxl_dvsec) 371 ret = nvgrace_gpu_test_device_ready_cxl(nvdev, NULL); 372 else 373 ret = nvgrace_gpu_wait_device_ready_legacy(nvdev->bar0_base); 374 if (ret) 375 return ret; 376 377 nvdev->reset_done = false; 378 379 return 0; 380 } 381 382 static unsigned long addr_to_pgoff(struct vm_area_struct *vma, 383 unsigned long addr) 384 { 385 u64 pgoff = vma->vm_pgoff & 386 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 387 388 return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; 389 } 390 391 static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, 392 unsigned int order) 393 { 394 struct vm_area_struct *vma = vmf->vma; 395 struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; 396 struct vfio_pci_core_device *vdev = &nvdev->core_device; 397 unsigned int index = 398 vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 399 vm_fault_t ret = VM_FAULT_FALLBACK; 400 struct mem_region *memregion; 401 unsigned long pfn, addr; 402 403 memregion = nvgrace_gpu_memregion(index, nvdev); 404 if (!memregion) 405 return VM_FAULT_SIGBUS; 406 407 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); 408 pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); 409 410 if (is_aligned_for_order(vma, addr, pfn, order)) { 411 /* 412 * Exit early under memory_lock to avoid a potentially lengthy 413 * device readiness wait on a runtime-suspended device. Any 414 * race after the lock is dropped is benign as the re-check 415 * inside the scoped guard below catches it. 416 */ 417 scoped_guard(rwsem_read, &vdev->memory_lock) { 418 if (vdev->pm_runtime_engaged) 419 return VM_FAULT_SIGBUS; 420 } 421 422 retry: 423 if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) && 424 nvgrace_gpu_wait_device_ready_cxl(nvdev)) 425 return VM_FAULT_SIGBUS; 426 427 scoped_guard(rwsem_read, &vdev->memory_lock) { 428 int rc; 429 430 if (vdev->pm_runtime_engaged) 431 return VM_FAULT_SIGBUS; 432 433 /* Re-run the wait if a reset raced us, not SIGBUS. */ 434 rc = nvgrace_gpu_check_device_ready(nvdev); 435 if (rc == -EAGAIN) 436 goto retry; 437 if (rc) 438 return VM_FAULT_SIGBUS; 439 440 ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); 441 } 442 } 443 444 dev_dbg_ratelimited(&vdev->pdev->dev, 445 "%s order = %d pfn 0x%lx: 0x%x\n", 446 __func__, order, pfn, 447 (unsigned int)ret); 448 449 return ret; 450 } 451 452 static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) 453 { 454 return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); 455 } 456 457 static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { 458 .fault = nvgrace_gpu_vfio_pci_fault, 459 #ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP 460 .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, 461 #endif 462 }; 463 464 static inline 465 struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma) 466 { 467 /* Check if this VMA belongs to us */ 468 if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops) 469 return NULL; 470 471 return vma->vm_private_data; 472 } 473 474 static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, 475 struct vm_area_struct *vma) 476 { 477 struct nvgrace_gpu_pci_core_device *nvdev = 478 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 479 core_device.vdev); 480 struct mem_region *memregion; 481 u64 req_len, pgoff, end; 482 unsigned int index; 483 484 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 485 486 memregion = nvgrace_gpu_memregion(index, nvdev); 487 if (!memregion) 488 return vfio_pci_core_mmap(core_vdev, vma); 489 490 /* 491 * Request to mmap the BAR. Map to the CPU accessible memory on the 492 * GPU using the memory information gathered from the system ACPI 493 * tables. 494 */ 495 pgoff = vma->vm_pgoff & 496 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 497 498 if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || 499 check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) 500 return -EOVERFLOW; 501 502 /* 503 * Check that the mapping request does not go beyond the exposed 504 * device memory size. 505 */ 506 if (end > memregion->memlength) 507 return -EINVAL; 508 509 vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); 510 511 /* 512 * The carved out region of the device memory needs the NORMAL_NC 513 * property. Communicate as such to the hypervisor. 514 */ 515 if (index == RESMEM_REGION_INDEX) { 516 /* 517 * The nvgrace-gpu module has no issues with uncontained 518 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is 519 * set to communicate to the KVM to S2 map as NORMAL_NC. 520 * This opens up guest usage of NORMAL_NC for this mapping. 521 */ 522 vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); 523 524 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 525 } 526 527 vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; 528 vma->vm_private_data = nvdev; 529 530 return 0; 531 } 532 533 static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, 534 struct vfio_region_info *info, 535 struct vfio_info_cap *caps) 536 { 537 struct nvgrace_gpu_pci_core_device *nvdev = 538 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 539 core_device.vdev); 540 struct vfio_region_info_cap_sparse_mmap *sparse; 541 struct mem_region *memregion; 542 u32 size; 543 int ret; 544 545 /* 546 * Request to determine the BAR region information. Send the 547 * GPU memory information. 548 */ 549 memregion = nvgrace_gpu_memregion(info->index, nvdev); 550 if (!memregion) 551 return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); 552 553 size = struct_size(sparse, areas, 1); 554 555 /* 556 * Setup for sparse mapping for the device memory. Only the 557 * available device memory on the hardware is shown as a 558 * mappable region. 559 */ 560 sparse = kzalloc(size, GFP_KERNEL); 561 if (!sparse) 562 return -ENOMEM; 563 564 sparse->nr_areas = 1; 565 sparse->areas[0].offset = 0; 566 sparse->areas[0].size = memregion->memlength; 567 sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; 568 sparse->header.version = 1; 569 570 ret = vfio_info_add_capability(caps, &sparse->header, size); 571 kfree(sparse); 572 if (ret) 573 return ret; 574 575 info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); 576 /* 577 * The region memory size may not be power-of-2 aligned. 578 * Given that the memory is a BAR and may not be 579 * aligned, roundup to the next power-of-2. 580 */ 581 info->size = memregion->bar_size; 582 info->flags = VFIO_REGION_INFO_FLAG_READ | 583 VFIO_REGION_INFO_FLAG_WRITE | 584 VFIO_REGION_INFO_FLAG_MMAP; 585 return 0; 586 } 587 588 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, 589 unsigned int cmd, unsigned long arg) 590 { 591 switch (cmd) { 592 case VFIO_DEVICE_IOEVENTFD: 593 return -ENOTTY; 594 case VFIO_DEVICE_RESET: 595 nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 596 fallthrough; 597 default: 598 return vfio_pci_core_ioctl(core_vdev, cmd, arg); 599 } 600 } 601 602 static __le64 603 nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) 604 { 605 u64 tmp_val; 606 607 tmp_val = le64_to_cpu(val64); 608 tmp_val &= ~(bar_size - 1); 609 tmp_val |= flags; 610 611 return cpu_to_le64(tmp_val); 612 } 613 614 /* 615 * Both the usable (usemem) and the reserved (resmem) device memory region 616 * are exposed as a 64b fake device BARs in the VM. These fake BARs must 617 * respond to the accesses on their respective PCI config space offsets. 618 * 619 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. 620 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. 621 */ 622 static ssize_t 623 nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, 624 char __user *buf, size_t count, loff_t *ppos) 625 { 626 struct nvgrace_gpu_pci_core_device *nvdev = 627 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 628 core_device.vdev); 629 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 630 struct mem_region *memregion = NULL; 631 __le64 val64; 632 size_t register_offset; 633 loff_t copy_offset; 634 size_t copy_count; 635 int ret; 636 637 ret = vfio_pci_core_read(core_vdev, buf, count, ppos); 638 if (ret < 0) 639 return ret; 640 641 if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 642 sizeof(val64), 643 ©_offset, ©_count, 644 ®ister_offset)) 645 memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 646 else if (vfio_pci_core_range_intersect_range(pos, count, 647 PCI_BASE_ADDRESS_4, 648 sizeof(val64), 649 ©_offset, ©_count, 650 ®ister_offset)) 651 memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 652 653 if (memregion) { 654 val64 = nvgrace_gpu_get_read_value(memregion->bar_size, 655 PCI_BASE_ADDRESS_MEM_TYPE_64 | 656 PCI_BASE_ADDRESS_MEM_PREFETCH, 657 memregion->bar_val); 658 if (copy_to_user(buf + copy_offset, 659 (void *)&val64 + register_offset, copy_count)) { 660 /* 661 * The position has been incremented in 662 * vfio_pci_core_read. Reset the offset back to the 663 * starting position. 664 */ 665 *ppos -= count; 666 return -EFAULT; 667 } 668 } 669 670 return count; 671 } 672 673 static ssize_t 674 nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, 675 const char __user *buf, size_t count, loff_t *ppos) 676 { 677 struct nvgrace_gpu_pci_core_device *nvdev = 678 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 679 core_device.vdev); 680 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 681 struct mem_region *memregion = NULL; 682 size_t register_offset; 683 loff_t copy_offset; 684 size_t copy_count; 685 686 if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 687 sizeof(u64), ©_offset, 688 ©_count, ®ister_offset)) 689 memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 690 else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, 691 sizeof(u64), ©_offset, 692 ©_count, ®ister_offset)) 693 memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 694 695 if (memregion) { 696 if (copy_from_user((void *)&memregion->bar_val + register_offset, 697 buf + copy_offset, copy_count)) 698 return -EFAULT; 699 *ppos += copy_count; 700 return copy_count; 701 } 702 703 return vfio_pci_core_write(core_vdev, buf, count, ppos); 704 } 705 706 /* 707 * Ad hoc map the device memory in the module kernel VA space. Primarily needed 708 * as vfio does not require the userspace driver to only perform accesses through 709 * mmaps of the vfio-pci BAR regions and such accesses should be supported using 710 * vfio_device_ops read/write implementations. 711 * 712 * The usemem region is cacheable memory and hence is memremaped. 713 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). 714 */ 715 static int 716 nvgrace_gpu_map_device_mem(int index, 717 struct nvgrace_gpu_pci_core_device *nvdev) 718 { 719 struct mem_region *memregion; 720 int ret = 0; 721 722 memregion = nvgrace_gpu_memregion(index, nvdev); 723 if (!memregion) 724 return -EINVAL; 725 726 mutex_lock(&nvdev->remap_lock); 727 728 if (memregion->memaddr) 729 goto unlock; 730 731 if (index == USEMEM_REGION_INDEX) 732 memregion->memaddr = memremap(memregion->memphys, 733 memregion->memlength, 734 MEMREMAP_WB); 735 else 736 memregion->ioaddr = ioremap_wc(memregion->memphys, 737 memregion->memlength); 738 739 if (!memregion->memaddr) 740 ret = -ENOMEM; 741 742 unlock: 743 mutex_unlock(&nvdev->remap_lock); 744 745 return ret; 746 } 747 748 /* 749 * Read the data from the device memory (mapped either through ioremap 750 * or memremap) into the user buffer. 751 */ 752 static int 753 nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, 754 char __user *buf, size_t mem_count, loff_t *ppos) 755 { 756 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 757 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 758 int ret; 759 760 if (!mem_count) 761 return 0; 762 763 /* 764 * Handle read on the BAR regions. Map to the target device memory 765 * physical address and copy to the request read buffer. 766 */ 767 ret = nvgrace_gpu_map_device_mem(index, nvdev); 768 if (ret) 769 return ret; 770 771 if (index == USEMEM_REGION_INDEX) { 772 if (copy_to_user(buf, 773 (u8 *)nvdev->usemem.memaddr + offset, 774 mem_count)) 775 ret = -EFAULT; 776 } else { 777 /* 778 * The hardware ensures that the system does not crash when 779 * the device memory is accessed with the memory enable 780 * turned off. It synthesizes ~0 on such read. So there is 781 * no need to check or support the disablement/enablement of 782 * BAR through PCI_COMMAND config space register. Pass 783 * test_mem flag as false. 784 */ 785 ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 786 nvdev->resmem.ioaddr, 787 buf, offset, mem_count, 788 0, 0, false, VFIO_PCI_IO_WIDTH_8); 789 } 790 791 return ret; 792 } 793 794 /* 795 * Read count bytes from the device memory at an offset. The actual device 796 * memory size (available) may not be a power-of-2. So the driver fakes 797 * the size to a power-of-2 (reported) when exposing to a user space driver. 798 * 799 * Reads starting beyond the reported size generate -EINVAL; reads extending 800 * beyond the actual device size is filled with ~0; reads extending beyond 801 * the reported size are truncated. 802 */ 803 static ssize_t 804 nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, 805 char __user *buf, size_t count, loff_t *ppos) 806 { 807 struct vfio_pci_core_device *vdev = &nvdev->core_device; 808 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 809 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 810 struct mem_region *memregion; 811 size_t mem_count, i; 812 u8 val = 0xFF; 813 int ret; 814 815 /* No need to do NULL check as caller does. */ 816 memregion = nvgrace_gpu_memregion(index, nvdev); 817 818 if (offset >= memregion->bar_size) 819 return -EINVAL; 820 821 /* Clip short the read request beyond reported BAR size */ 822 count = min(count, memregion->bar_size - (size_t)offset); 823 824 /* 825 * Determine how many bytes to be actually read from the device memory. 826 * Read request beyond the actual device memory size is filled with ~0, 827 * while those beyond the actual reported size is skipped. 828 */ 829 if (offset >= memregion->memlength) 830 mem_count = 0; 831 else 832 mem_count = min(count, memregion->memlength - (size_t)offset); 833 834 if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) { 835 ret = nvgrace_gpu_wait_device_ready_cxl(nvdev); 836 if (ret) 837 return ret; 838 } 839 840 scoped_guard(rwsem_read, &vdev->memory_lock) { 841 ret = nvgrace_gpu_check_device_ready(nvdev); 842 if (ret) 843 return ret; 844 845 ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); 846 if (ret) 847 return ret; 848 } 849 850 /* 851 * Only the device memory present on the hardware is mapped, which may 852 * not be power-of-2 aligned. A read to an offset beyond the device memory 853 * size is filled with ~0. 854 */ 855 for (i = mem_count; i < count; i++) { 856 ret = put_user(val, (unsigned char __user *)(buf + i)); 857 if (ret) 858 return ret; 859 } 860 861 *ppos += count; 862 return count; 863 } 864 865 static ssize_t 866 nvgrace_gpu_read(struct vfio_device *core_vdev, 867 char __user *buf, size_t count, loff_t *ppos) 868 { 869 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 870 struct nvgrace_gpu_pci_core_device *nvdev = 871 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 872 core_device.vdev); 873 struct vfio_pci_core_device *vdev = &nvdev->core_device; 874 int ret; 875 876 if (nvgrace_gpu_memregion(index, nvdev)) { 877 if (pm_runtime_resume_and_get(&vdev->pdev->dev)) 878 return -EIO; 879 ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); 880 pm_runtime_put(&vdev->pdev->dev); 881 return ret; 882 } 883 884 if (index == VFIO_PCI_CONFIG_REGION_INDEX) 885 return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); 886 887 return vfio_pci_core_read(core_vdev, buf, count, ppos); 888 } 889 890 /* 891 * Write the data to the device memory (mapped either through ioremap 892 * or memremap) from the user buffer. 893 */ 894 static int 895 nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, 896 const char __user *buf, size_t mem_count, 897 loff_t *ppos) 898 { 899 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 900 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 901 int ret; 902 903 if (!mem_count) 904 return 0; 905 906 ret = nvgrace_gpu_map_device_mem(index, nvdev); 907 if (ret) 908 return ret; 909 910 if (index == USEMEM_REGION_INDEX) { 911 if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, 912 buf, mem_count)) 913 return -EFAULT; 914 } else { 915 /* 916 * The hardware ensures that the system does not crash when 917 * the device memory is accessed with the memory enable 918 * turned off. It drops such writes. So there is no need to 919 * check or support the disablement/enablement of BAR 920 * through PCI_COMMAND config space register. Pass test_mem 921 * flag as false. 922 */ 923 ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 924 nvdev->resmem.ioaddr, 925 (char __user *)buf, pos, mem_count, 926 0, 0, true, VFIO_PCI_IO_WIDTH_8); 927 } 928 929 return ret; 930 } 931 932 /* 933 * Write count bytes to the device memory at a given offset. The actual device 934 * memory size (available) may not be a power-of-2. So the driver fakes the 935 * size to a power-of-2 (reported) when exposing to a user space driver. 936 * 937 * Writes extending beyond the reported size are truncated; writes starting 938 * beyond the reported size generate -EINVAL. 939 */ 940 static ssize_t 941 nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, 942 size_t count, loff_t *ppos, const char __user *buf) 943 { 944 struct vfio_pci_core_device *vdev = &nvdev->core_device; 945 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 946 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 947 struct mem_region *memregion; 948 size_t mem_count; 949 int ret = 0; 950 951 /* No need to do NULL check as caller does. */ 952 memregion = nvgrace_gpu_memregion(index, nvdev); 953 954 if (offset >= memregion->bar_size) 955 return -EINVAL; 956 957 /* Clip short the write request beyond reported BAR size */ 958 count = min(count, memregion->bar_size - (size_t)offset); 959 960 /* 961 * Determine how many bytes to be actually written to the device memory. 962 * Do not write to the offset beyond available size. 963 */ 964 if (offset >= memregion->memlength) 965 goto exitfn; 966 967 /* 968 * Only the device memory present on the hardware is mapped, which may 969 * not be power-of-2 aligned. Drop access outside the available device 970 * memory on the hardware. 971 */ 972 mem_count = min(count, memregion->memlength - (size_t)offset); 973 974 if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) { 975 ret = nvgrace_gpu_wait_device_ready_cxl(nvdev); 976 if (ret) 977 return ret; 978 } 979 980 scoped_guard(rwsem_read, &vdev->memory_lock) { 981 ret = nvgrace_gpu_check_device_ready(nvdev); 982 if (ret) 983 return ret; 984 985 ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); 986 if (ret) 987 return ret; 988 } 989 990 exitfn: 991 *ppos += count; 992 return count; 993 } 994 995 static ssize_t 996 nvgrace_gpu_write(struct vfio_device *core_vdev, 997 const char __user *buf, size_t count, loff_t *ppos) 998 { 999 struct nvgrace_gpu_pci_core_device *nvdev = 1000 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 1001 core_device.vdev); 1002 struct vfio_pci_core_device *vdev = &nvdev->core_device; 1003 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1004 int ret; 1005 1006 if (nvgrace_gpu_memregion(index, nvdev)) { 1007 if (pm_runtime_resume_and_get(&vdev->pdev->dev)) 1008 return -EIO; 1009 ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); 1010 pm_runtime_put(&vdev->pdev->dev); 1011 return ret; 1012 } 1013 1014 if (index == VFIO_PCI_CONFIG_REGION_INDEX) 1015 return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); 1016 1017 return vfio_pci_core_write(core_vdev, buf, count, ppos); 1018 } 1019 1020 static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, 1021 struct p2pdma_provider **provider, 1022 unsigned int region_index, 1023 struct phys_vec *phys_vec, 1024 struct vfio_region_dma_range *dma_ranges, 1025 size_t nr_ranges) 1026 { 1027 struct nvgrace_gpu_pci_core_device *nvdev = container_of( 1028 core_vdev, struct nvgrace_gpu_pci_core_device, core_device); 1029 struct pci_dev *pdev = core_vdev->pdev; 1030 struct mem_region *mem_region; 1031 1032 /* 1033 * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { 1034 * The P2P properties of the non-BAR memory is the same as the 1035 * BAR memory, so just use the provider for index 0. Someday 1036 * when CXL gets P2P support we could create CXLish providers 1037 * for the non-BAR memory. 1038 * } else if (region_index == USEMEM_REGION_INDEX) { 1039 * This is actually cachable memory and isn't treated as P2P in 1040 * the chip. For now we have no way to push cachable memory 1041 * through everything and the Grace HW doesn't care what caching 1042 * attribute is programmed into the SMMU. So use BAR 0. 1043 * } 1044 */ 1045 mem_region = nvgrace_gpu_memregion(region_index, nvdev); 1046 if (mem_region) { 1047 *provider = pcim_p2pdma_provider(pdev, 0); 1048 if (!*provider) 1049 return -EINVAL; 1050 return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, 1051 nr_ranges, 1052 mem_region->memphys, 1053 mem_region->memlength); 1054 } 1055 1056 return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, 1057 phys_vec, dma_ranges, nr_ranges); 1058 } 1059 1060 static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { 1061 .get_dmabuf_phys = nvgrace_get_dmabuf_phys, 1062 }; 1063 1064 static const struct vfio_device_ops nvgrace_gpu_pci_ops = { 1065 .name = "nvgrace-gpu-vfio-pci", 1066 .init = vfio_pci_core_init_dev, 1067 .release = vfio_pci_core_release_dev, 1068 .open_device = nvgrace_gpu_open_device, 1069 .close_device = nvgrace_gpu_close_device, 1070 .ioctl = nvgrace_gpu_ioctl, 1071 .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, 1072 .device_feature = vfio_pci_core_ioctl_feature, 1073 .read = nvgrace_gpu_read, 1074 .write = nvgrace_gpu_write, 1075 .mmap = nvgrace_gpu_mmap, 1076 .request = vfio_pci_core_request, 1077 .match = vfio_pci_core_match, 1078 .match_token_uuid = vfio_pci_core_match_token_uuid, 1079 .bind_iommufd = vfio_iommufd_physical_bind, 1080 .unbind_iommufd = vfio_iommufd_physical_unbind, 1081 .attach_ioas = vfio_iommufd_physical_attach_ioas, 1082 .detach_ioas = vfio_iommufd_physical_detach_ioas, 1083 }; 1084 1085 static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { 1086 .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, 1087 }; 1088 1089 static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { 1090 .name = "nvgrace-gpu-vfio-pci-core", 1091 .init = vfio_pci_core_init_dev, 1092 .release = vfio_pci_core_release_dev, 1093 .open_device = nvgrace_gpu_open_device, 1094 .close_device = vfio_pci_core_close_device, 1095 .ioctl = vfio_pci_core_ioctl, 1096 .get_region_info_caps = vfio_pci_ioctl_get_region_info, 1097 .device_feature = vfio_pci_core_ioctl_feature, 1098 .read = vfio_pci_core_read, 1099 .write = vfio_pci_core_write, 1100 .mmap = vfio_pci_core_mmap, 1101 .request = vfio_pci_core_request, 1102 .match = vfio_pci_core_match, 1103 .match_token_uuid = vfio_pci_core_match_token_uuid, 1104 .bind_iommufd = vfio_iommufd_physical_bind, 1105 .unbind_iommufd = vfio_iommufd_physical_unbind, 1106 .attach_ioas = vfio_iommufd_physical_attach_ioas, 1107 .detach_ioas = vfio_iommufd_physical_detach_ioas, 1108 }; 1109 1110 static int 1111 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, 1112 u64 *pmemphys, u64 *pmemlength) 1113 { 1114 int ret; 1115 1116 /* 1117 * The memory information is present in the system ACPI tables as DSD 1118 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. 1119 */ 1120 ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", 1121 pmemphys); 1122 if (ret) 1123 return ret; 1124 1125 if (*pmemphys > type_max(phys_addr_t)) 1126 return -EOVERFLOW; 1127 1128 ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", 1129 pmemlength); 1130 if (ret) 1131 return ret; 1132 1133 if (*pmemlength > type_max(size_t)) 1134 return -EOVERFLOW; 1135 1136 /* 1137 * If the C2C link is not up due to an error, the coherent device 1138 * memory size is returned as 0. Fail in such case. 1139 */ 1140 if (*pmemlength == 0) 1141 return -ENOMEM; 1142 1143 return ret; 1144 } 1145 1146 static int 1147 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, 1148 struct nvgrace_gpu_pci_core_device *nvdev, 1149 u64 memphys, u64 memlength) 1150 { 1151 int ret = 0; 1152 u64 resmem_size = 0; 1153 1154 /* 1155 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable 1156 * region to support the MIG feature owing to a hardware bug. Since the 1157 * device memory is mapped as NORMAL cached, carve out a region from the end 1158 * with a different NORMAL_NC property (called as reserved memory and 1159 * represented as resmem). This region then is exposed as a 64b BAR 1160 * (region 2 and 3) to the VM, while exposing the rest (termed as usable 1161 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). 1162 * 1163 * devmem (memlength) 1164 * |-------------------------------------------------| 1165 * | | 1166 * usemem.memphys resmem.memphys 1167 * 1168 * This hardware bug is fixed on the Grace Blackwell platforms and the 1169 * presence of the bug can be determined through nvdev->has_mig_hw_bug. 1170 * Thus on systems with the hardware fix, there is no need to partition 1171 * the GPU device memory and the entire memory is usable and mapped as 1172 * NORMAL cached (i.e. resmem size is 0). 1173 */ 1174 if (nvdev->has_mig_hw_bug) 1175 resmem_size = SZ_1G; 1176 1177 nvdev->usemem.memphys = memphys; 1178 1179 /* 1180 * The device memory exposed to the VM is added to the kernel by the 1181 * VM driver module in chunks of memory block size. Note that only the 1182 * usable memory (usemem) is added to the kernel for usage by the VM 1183 * workloads. 1184 */ 1185 if (check_sub_overflow(memlength, resmem_size, 1186 &nvdev->usemem.memlength)) { 1187 ret = -EOVERFLOW; 1188 goto done; 1189 } 1190 1191 /* 1192 * The usemem region is exposed as a 64B Bar composed of region 4 and 5. 1193 * Calculate and save the BAR size for the region. 1194 */ 1195 nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 1196 1197 /* 1198 * If the hardware has the fix for MIG, there is no requirement 1199 * for splitting the device memory to create RESMEM. The entire 1200 * device memory is usable and will be USEMEM. Return here for 1201 * such case. 1202 */ 1203 if (!nvdev->has_mig_hw_bug) 1204 goto done; 1205 1206 /* 1207 * When the device memory is split to workaround the MIG bug on 1208 * Grace Hopper, the USEMEM part of the device memory has to be 1209 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the 1210 * GPU FW and VFIO driver. The VM device driver is also aware of it 1211 * and make use of the value for its calculation to determine USEMEM 1212 * size. Note that the device memory may not be 512M aligned. 1213 */ 1214 nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, 1215 MEMBLK_SIZE); 1216 if (nvdev->usemem.memlength == 0) { 1217 ret = -EINVAL; 1218 goto done; 1219 } 1220 1221 if ((check_add_overflow(nvdev->usemem.memphys, 1222 nvdev->usemem.memlength, 1223 &nvdev->resmem.memphys)) || 1224 (check_sub_overflow(memlength, nvdev->usemem.memlength, 1225 &nvdev->resmem.memlength))) { 1226 ret = -EOVERFLOW; 1227 goto done; 1228 } 1229 1230 /* 1231 * The resmem region is exposed as a 64b BAR composed of region 2 and 3 1232 * for Grace Hopper. Calculate and save the BAR size for the region. 1233 */ 1234 nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); 1235 done: 1236 return ret; 1237 } 1238 1239 static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) 1240 { 1241 int pcie_dvsec; 1242 u16 dvsec_ctrl16; 1243 1244 pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, 1245 GPU_CAP_DVSEC_REGISTER); 1246 1247 if (pcie_dvsec) { 1248 pci_read_config_word(pdev, 1249 pcie_dvsec + DVSEC_BITMAP_OFFSET, 1250 &dvsec_ctrl16); 1251 1252 if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) 1253 return false; 1254 } 1255 1256 return true; 1257 } 1258 1259 /* 1260 * To reduce the system bootup time, the HBM training has 1261 * been moved out of the UEFI on the Grace-Blackwell systems. 1262 * 1263 * The onus of checking whether the HBM training has completed 1264 * thus falls on the module. The HBM training status can be 1265 * determined from a BAR0 register. 1266 * 1267 * Similarly, another BAR0 register exposes the status of the 1268 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. 1269 * 1270 * Poll these register and check for 30s. If the HBM training is 1271 * not complete or if the C2C link is not ready, fail the probe. 1272 * 1273 * While the wait is not required on Grace Hopper systems, it 1274 * is beneficial to make the check to ensure the device is in an 1275 * expected state. 1276 * 1277 * On Blackwell-Next systems, memory readiness is determined via the 1278 * CXL Device DVSEC in PCI config space and does not require BAR0. 1279 * For the legacy path, ensure BAR0 is enabled before accessing the 1280 * registers. 1281 */ 1282 static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) 1283 { 1284 struct pci_dev *pdev = nvdev->core_device.pdev; 1285 void __iomem *io; 1286 int ret; 1287 1288 /* 1289 * Note that the worst-case wait here is ~256s (vs ~30s on the 1290 * legacy path) and may block device unbind/sysfs for the duration. 1291 */ 1292 if (nvdev->cxl_dvsec) 1293 return nvgrace_gpu_wait_device_ready_cxl(nvdev); 1294 1295 ret = pci_enable_device(pdev); 1296 if (ret) 1297 return ret; 1298 1299 ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); 1300 if (ret) 1301 goto request_region_exit; 1302 1303 io = pci_iomap(pdev, 0, 0); 1304 if (!io) { 1305 ret = -ENOMEM; 1306 goto iomap_exit; 1307 } 1308 1309 ret = nvgrace_gpu_wait_device_ready_legacy(io); 1310 1311 pci_iounmap(pdev, io); 1312 iomap_exit: 1313 pci_release_selected_regions(pdev, 1 << 0); 1314 request_region_exit: 1315 pci_disable_device(pdev); 1316 return ret; 1317 } 1318 1319 static int nvgrace_gpu_probe(struct pci_dev *pdev, 1320 const struct pci_device_id *id) 1321 { 1322 const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; 1323 struct nvgrace_gpu_pci_core_device *nvdev; 1324 u64 memphys, memlength; 1325 int ret; 1326 1327 ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); 1328 if (!ret) 1329 ops = &nvgrace_gpu_pci_ops; 1330 1331 nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, 1332 &pdev->dev, ops); 1333 if (IS_ERR(nvdev)) 1334 return PTR_ERR(nvdev); 1335 1336 nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, 1337 PCI_DVSEC_CXL_DEVICE); 1338 1339 ret = nvgrace_gpu_probe_check_device_ready(nvdev); 1340 if (ret) 1341 goto out_put_vdev; 1342 1343 dev_set_drvdata(&pdev->dev, &nvdev->core_device); 1344 1345 if (ops == &nvgrace_gpu_pci_ops) { 1346 nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); 1347 1348 /* 1349 * Device memory properties are identified in the host ACPI 1350 * table. Set the nvgrace_gpu_pci_core_device structure. 1351 */ 1352 ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, 1353 memphys, memlength); 1354 if (ret) 1355 goto out_put_vdev; 1356 nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; 1357 } else { 1358 nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; 1359 } 1360 1361 ret = vfio_pci_core_register_device(&nvdev->core_device); 1362 if (ret) 1363 goto out_put_vdev; 1364 1365 return ret; 1366 1367 out_put_vdev: 1368 vfio_put_device(&nvdev->core_device.vdev); 1369 return ret; 1370 } 1371 1372 static void nvgrace_gpu_remove(struct pci_dev *pdev) 1373 { 1374 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 1375 1376 vfio_pci_core_unregister_device(core_device); 1377 vfio_put_device(&core_device->vdev); 1378 } 1379 1380 static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { 1381 /* GH200 120GB */ 1382 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, 1383 /* GH200 480GB */ 1384 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, 1385 /* GH200 SKU */ 1386 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, 1387 /* GB200 SKU */ 1388 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, 1389 /* GB300 SKU */ 1390 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x31C2) }, 1391 {} 1392 }; 1393 1394 MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); 1395 1396 /* 1397 * The GPU reset is required to be serialized against the *first* mapping 1398 * faults and read/writes accesses to prevent potential RAS events logging. 1399 * 1400 * First fault or access after a reset needs to poll device readiness, 1401 * flag that a reset has occurred. The readiness test is done by holding 1402 * the memory_lock read lock and we expect all vfio-pci initiated resets to 1403 * hold the memory_lock write lock to avoid races. However, .reset_done 1404 * extends beyond the scope of vfio-pci initiated resets therefore we 1405 * cannot assert this behavior and use lockdep_assert_held_write. 1406 */ 1407 static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) 1408 { 1409 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 1410 struct nvgrace_gpu_pci_core_device *nvdev = 1411 container_of(core_device, struct nvgrace_gpu_pci_core_device, 1412 core_device); 1413 1414 nvdev->reset_done = true; 1415 } 1416 1417 static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { 1418 .reset_done = nvgrace_gpu_vfio_pci_reset_done, 1419 .error_detected = vfio_pci_core_aer_err_detected, 1420 }; 1421 1422 static struct pci_driver nvgrace_gpu_vfio_pci_driver = { 1423 .name = KBUILD_MODNAME, 1424 .id_table = nvgrace_gpu_vfio_pci_table, 1425 .probe = nvgrace_gpu_probe, 1426 .remove = nvgrace_gpu_remove, 1427 .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, 1428 .driver_managed_dma = true, 1429 }; 1430 1431 module_pci_driver(nvgrace_gpu_vfio_pci_driver); 1432 1433 MODULE_LICENSE("GPL"); 1434 MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>"); 1435 MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>"); 1436 MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory"); 1437