// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/pci-p2pdma.h>
#include <linux/pm_runtime.h>

/*
 * The device memory usable to the workloads running in the VM is cached
 * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region)
 * to the VM and is represented as usemem.
 * Moreover, the VM GPU device driver needs a non-cacheable region to
 * support the MIG feature. This region is also exposed as a 64b BAR
 * (comprising of BAR2 and BAR3 region) and represented as resmem.
 */
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

/*
 * NVIDIA DVSEC capability bitmap used to detect whether the hardware
 * carries the fix for the MIG bug (see nvgrace_gpu_has_mig_hw_bug()).
 */
#define DVSEC_BITMAP_OFFSET 0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)

#define GPU_CAP_DVSEC_REGISTER 3

/* BAR0 registers polled to determine device readiness. */
#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
#define STATUS_READY 0xFF

/* Readiness poll interval and overall poll timeout. */
#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)

/*
 * The state of the two device memory region - resmem and usemem - is
 * saved as struct mem_region.
 */
struct mem_region {
	phys_addr_t memphys;	/* Base physical address of the region */
	size_t memlength;	/* Region size */
	size_t bar_size;	/* Reported region BAR size */
	__le64 bar_val;		/* Emulated BAR offset registers */
	union {
		void *memaddr;		/* usemem: memremap()'ed mapping */
		void __iomem *ioaddr;	/* resmem: ioremap_wc()'ed mapping */
	};			/* Base virtual address of the region */
};

/* Per-device state wrapping the generic vfio-pci core device. */
struct nvgrace_gpu_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* Cached and usable memory for the VM.
	 */
	struct mem_region usemem;
	/* Non cached memory carved out from the end of device memory */
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
	/* True when the MIG hardware bug is present (resmem carve-out needed) */
	bool has_mig_hw_bug;
	/* GPU has just been reset */
	bool reset_done;
};

/* Reset the emulated BAR offset registers of both fake BARs to zero. */
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	nvdev->resmem.bar_val = 0;
	nvdev->usemem.bar_val = 0;
}

/* Choose the structure corresponding to the fake BAR with a given index. */
static struct mem_region *
nvgrace_gpu_memregion(int index,
		      struct nvgrace_gpu_pci_core_device *nvdev)
{
	if (index == USEMEM_REGION_INDEX)
		return &nvdev->usemem;

	/* resmem only exists when the device memory was partitioned. */
	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
		return &nvdev->resmem;

	return NULL;
}

/*
 * Enable the device through the vfio-pci core and prepare the fake BAR
 * emulation state and the BAR0 mapping used for the readiness checks.
 */
static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	/* Fake BAR state is only needed when device memory was discovered. */
	if (nvdev->usemem.memlength) {
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		mutex_init(&nvdev->remap_lock);
	}

	/*
	 * GPU readiness is checked by reading the BAR0 registers.
	 *
	 * ioremap BAR0 to ensure that the BAR0 mapping is present before
	 * register reads on first fault before establishing any GPU
	 * memory mapping.
	 */
	ret = vfio_pci_core_setup_barmap(vdev, 0);
	if (ret) {
		vfio_pci_core_disable(vdev);
		return ret;
	}

	vfio_pci_core_finish_enable(vdev);

	return 0;
}

/* Tear down the ad hoc device memory mappings and close the core device. */
static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	/* Unmap the mapping to the device memory cached region */
	if (nvdev->usemem.memaddr) {
		memunmap(nvdev->usemem.memaddr);
		nvdev->usemem.memaddr = NULL;
	}

	/* Unmap the mapping to the device memory non-cached region */
	if (nvdev->resmem.ioaddr) {
		iounmap(nvdev->resmem.ioaddr);
		nvdev->resmem.ioaddr = NULL;
	}

	mutex_destroy(&nvdev->remap_lock);

	vfio_pci_core_close_device(core_vdev);
}

/*
 * Poll the C2C link and HBM training status registers in BAR0 until both
 * report STATUS_READY or the 30s timeout expires.
 *
 * Returns 0 when the device is ready, -ETIME on timeout.
 */
static int nvgrace_gpu_wait_device_ready(void __iomem *io)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);

	do {
		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
			return 0;
		msleep(POLL_QUANTUM_MS);
	} while (!time_after(jiffies, timeout));

	return -ETIME;
}

/*
 * If the GPU memory is accessed by the CPU while the GPU is not ready
 * after reset, it can cause harmless corrected RAS events to be logged.
 * Make sure the GPU is ready before establishing the mappings.
 */
static int
nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	int ret;

	lockdep_assert_held_read(&vdev->memory_lock);

	/* Only the first access after a reset needs the readiness poll. */
	if (!nvdev->reset_done)
		return 0;

	if (!__vfio_pci_memory_enabled(vdev))
		return -EIO;

	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
	if (ret)
		return ret;

	nvdev->reset_done = false;

	return 0;
}

/*
 * Translate a faulting user VA into a page offset within the fake BAR,
 * accounting for the region offset encoded in the low bits of vm_pgoff.
 */
static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
				   unsigned long addr)
{
	u64 pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
}

/*
 * Fault handler inserting a PFN mapping of the given order for one of the
 * fake BAR regions, after confirming device readiness under memory_lock.
 * Returns VM_FAULT_FALLBACK when the request is not aligned for the order.
 */
static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
						  unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	unsigned int index =
		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	vm_fault_t ret = VM_FAULT_FALLBACK;
	struct mem_region *memregion;
	unsigned long pfn, addr;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return VM_FAULT_SIGBUS;

	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);

	if (is_aligned_for_order(vma, addr, pfn, order)) {
		scoped_guard(rwsem_read, &vdev->memory_lock) {
			if (vdev->pm_runtime_engaged ||
			    nvgrace_gpu_check_device_ready(nvdev))
				return VM_FAULT_SIGBUS;

			ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
		}
	}

	dev_dbg_ratelimited(&vdev->pdev->dev,
			    "%s order = %d pfn 0x%lx: 0x%x\n",
			    __func__, order, pfn,
			    (unsigned int)ret);

	return ret;
}

/* Order-0 wrapper used as the regular .fault handler. */
static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
{
	return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
}

static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
	.fault = nvgrace_gpu_vfio_pci_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
	.huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
#endif
};

/*
 * mmap handler: the fake BAR regions are populated on fault through the
 * ops above; every other region is forwarded to the vfio-pci core.
 */
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
			    struct vm_area_struct *vma)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct mem_region *memregion;
	u64 req_len, pgoff, end;
	unsigned int index;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return vfio_pci_core_mmap(core_vdev, vma);

	/*
	 * Request to mmap the BAR. Map to the CPU accessible memory on the
	 * GPU using the memory information gathered from the system ACPI
	 * tables.
	 */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
		return -EOVERFLOW;

	/*
	 * Check that the mapping request does not go beyond the exposed
	 * device memory size.
	 */
	if (end > memregion->memlength)
		return -EINVAL;

	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

	/*
	 * The carved out region of the device memory needs the NORMAL_NC
	 * property. Communicate as such to the hypervisor.
	 */
	if (index == RESMEM_REGION_INDEX) {
		/*
		 * The nvgrace-gpu module has no issues with uncontained
		 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
		 * set to communicate to the KVM to S2 map as NORMAL_NC.
		 * This opens up guest usage of NORMAL_NC for this mapping.
		 */
		vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);

		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
	}

	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
	vma->vm_private_data = nvdev;

	return 0;
}

/*
 * Report region info for the fake BARs: a power-of-2 reported BAR size
 * with a sparse mmap capability restricted to the actual device memory
 * length. Other regions are forwarded to the vfio-pci core.
 */
static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
					     struct vfio_region_info *info,
					     struct vfio_info_cap *caps)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct vfio_region_info_cap_sparse_mmap *sparse;
	struct mem_region *memregion;
	u32 size;
	int ret;

	/*
	 * Request to determine the BAR region information. Send the
	 * GPU memory information.
	 */
	memregion = nvgrace_gpu_memregion(info->index, nvdev);
	if (!memregion)
		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);

	size = struct_size(sparse, areas, 1);

	/*
	 * Setup for sparse mapping for the device memory. Only the
	 * available device memory on the hardware is shown as a
	 * mappable region.
	 */
	sparse = kzalloc(size, GFP_KERNEL);
	if (!sparse)
		return -ENOMEM;

	sparse->nr_areas = 1;
	sparse->areas[0].offset = 0;
	sparse->areas[0].size = memregion->memlength;
	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
	sparse->header.version = 1;

	ret = vfio_info_add_capability(caps, &sparse->header, size);
	kfree(sparse);
	if (ret)
		return ret;

	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
	/*
	 * The region memory size may not be power-of-2 aligned.
	 * Given that the memory is a BAR and may not be
	 * aligned, roundup to the next power-of-2.
	 */
	info->size = memregion->bar_size;
	info->flags = VFIO_REGION_INFO_FLAG_READ |
		      VFIO_REGION_INFO_FLAG_WRITE |
		      VFIO_REGION_INFO_FLAG_MMAP;
	return 0;
}

/*
 * ioctl handler: ioeventfds are not supported on the fake BARs; a device
 * reset additionally clears the emulated BAR registers before being
 * forwarded to the vfio-pci core.
 */
static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
			      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case VFIO_DEVICE_IOEVENTFD:
		return -ENOTTY;
	case VFIO_DEVICE_RESET:
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		fallthrough;
	default:
		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
	}
}

/*
 * Compose the value returned for a read of an emulated BAR register:
 * the stored offset masked to the BAR size alignment, with the given
 * PCI BAR flag bits ORed into the low bits.
 */
static __le64
nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
{
	u64 tmp_val;

	tmp_val = le64_to_cpu(val64);
	tmp_val &= ~(bar_size - 1);
	tmp_val |= flags;

	return cpu_to_le64(tmp_val);
}

/*
 * Both the usable (usemem) and the reserved (resmem) device memory region
 * are exposed as a 64b fake device BARs in the VM. These fake BARs must
 * respond to the accesses on their respective PCI config space offsets.
 *
 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
 */
static ssize_t
nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
			    char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	__le64 val64;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;
	int ret;

	/* Let the core service the whole request first. */
	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
	if (ret < 0)
		return ret;

	/* Overlay the emulated BAR registers where the request overlaps them. */
	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(val64),
						&copy_offset, &copy_count,
						&register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count,
						     PCI_BASE_ADDRESS_4,
						     sizeof(val64),
						     &copy_offset, &copy_count,
						     &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
						   PCI_BASE_ADDRESS_MEM_PREFETCH,
						   memregion->bar_val);
		if (copy_to_user(buf + copy_offset,
				 (void *)&val64 + register_offset, copy_count)) {
			/*
			 * The position has been incremented in
			 * vfio_pci_core_read. Reset the offset back to the
			 * starting position.
			 */
			*ppos -= count;
			return -EFAULT;
		}
	}

	return count;
}

/*
 * Emulate writes to the fake BAR offsets in config space; any other
 * config access is forwarded to the vfio-pci core.
 */
static ssize_t
nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
			     const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(u64), &copy_offset,
						&copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
						     sizeof(u64), &copy_offset,
						     &copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		/* Store the written offset in the emulated BAR register. */
		if (copy_from_user((void *)&memregion->bar_val + register_offset,
				   buf + copy_offset, copy_count))
			return -EFAULT;
		*ppos += copy_count;
		return copy_count;
	}

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

/*
 * Ad hoc map the device memory in the module kernel VA space. Primarily needed
 * as vfio does not require the userspace driver to only perform accesses through
 * mmaps of the vfio-pci BAR regions and such accesses should be supported using
 * vfio_device_ops read/write implementations.
 *
 * The usemem region is cacheable memory and hence is memremaped.
 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
 */
static int
nvgrace_gpu_map_device_mem(int index,
			   struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct mem_region *memregion;
	int ret = 0;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return -EINVAL;

	mutex_lock(&nvdev->remap_lock);

	/* Already mapped by a prior access; nothing to do. */
	if (memregion->memaddr)
		goto unlock;

	if (index == USEMEM_REGION_INDEX)
		memregion->memaddr = memremap(memregion->memphys,
					      memregion->memlength,
					      MEMREMAP_WB);
	else
		memregion->ioaddr = ioremap_wc(memregion->memphys,
					       memregion->memlength);

	/* memaddr/ioaddr alias in a union, so one check covers both. */
	if (!memregion->memaddr)
		ret = -ENOMEM;

unlock:
	mutex_unlock(&nvdev->remap_lock);

	return ret;
}

/*
 * Read the data from the device memory (mapped either through ioremap
 * or memremap) into the user buffer.
 */
static int
nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
			 char __user *buf, size_t mem_count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	/*
	 * Handle read on the BAR regions. Map to the target device memory
	 * physical address and copy to the request read buffer.
	 */
	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_to_user(buf,
				 (u8 *)nvdev->usemem.memaddr + offset,
				 mem_count))
			ret = -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It synthesizes ~0 on such read. So there is
		 * no need to check or support the disablement/enablement of
		 * BAR through PCI_COMMAND config space register. Pass
		 * test_mem flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     buf, offset, mem_count,
					     0, 0, false, VFIO_PCI_IO_WIDTH_8);
	}

	return ret;
}

/*
 * Read count bytes from the device memory at an offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes
 * the size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Reads starting beyond the reported size generate -EINVAL; reads extending
 * beyond the actual device size is filled with ~0; reads extending beyond
 * the reported size are truncated.
 */
static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		     char __user *buf, size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct mem_region *memregion;
	size_t mem_count, i;
	u8 val = 0xFF;
	int ret;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the read request beyond reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes to be actually read from the device memory.
	 * Read request beyond the actual device memory size is filled with ~0,
	 * while those beyond the actual reported size is skipped.
	 */
	if (offset >= memregion->memlength)
		mem_count = 0;
	else
		mem_count = min(count, memregion->memlength - (size_t)offset);

	/* Serialize against reset; see nvgrace_gpu_check_device_ready(). */
	scoped_guard(rwsem_read, &vdev->memory_lock) {
		ret = nvgrace_gpu_check_device_ready(nvdev);
		if (ret)
			return ret;

		ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
		if (ret)
			return ret;
	}

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. A read to an offset beyond the device memory
	 * size is filled with ~0.
	 */
	for (i = mem_count; i < count; i++) {
		ret = put_user(val, (unsigned char __user *)(buf + i));
		if (ret)
			return ret;
	}

	*ppos += count;
	return count;
}

/*
 * Top-level read handler: fake BAR regions are serviced from device
 * memory (with the device runtime-resumed), config space goes through
 * the BAR emulation, and everything else to the vfio-pci core.
 */
static ssize_t
nvgrace_gpu_read(struct vfio_device *core_vdev,
		 char __user *buf, size_t count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	int ret;

	if (nvgrace_gpu_memregion(index, nvdev)) {
		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
			return -EIO;
		ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
		pm_runtime_put(&vdev->pdev->dev);
		return ret;
	}

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_read(core_vdev, buf, count, ppos);
}

/*
 * Write the data to the device memory (mapped either through ioremap
 * or memremap) from the user buffer.
 */
static int
nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
			  const char __user *buf, size_t mem_count,
			  loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
				   buf, mem_count))
			return -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It drops such writes. So there is no need to
		 * check or support the disablement/enablement of BAR
		 * through PCI_COMMAND config space register. Pass test_mem
		 * flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     (char __user *)buf, pos, mem_count,
					     0, 0, true, VFIO_PCI_IO_WIDTH_8);
	}

	return ret;
}

/*
 * Write count bytes to the device memory at a given offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes the
 * size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Writes extending beyond the reported size are truncated; writes starting
 * beyond the reported size generate -EINVAL.
 */
static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		      size_t count, loff_t *ppos, const char __user *buf)
{
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion;
	size_t mem_count;
	int ret = 0;

	/* No need to do NULL check as caller does.
	 */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the write request beyond reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes to be actually written to the device memory.
	 * Do not write to the offset beyond available size.
	 */
	if (offset >= memregion->memlength)
		goto exitfn;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. Drop access outside the available device
	 * memory on the hardware.
	 */
	mem_count = min(count, memregion->memlength - (size_t)offset);

	/* Serialize against reset; see nvgrace_gpu_check_device_ready(). */
	scoped_guard(rwsem_read, &vdev->memory_lock) {
		ret = nvgrace_gpu_check_device_ready(nvdev);
		if (ret)
			return ret;

		ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
		if (ret)
			return ret;
	}

exitfn:
	/* Writes beyond the available device memory are silently dropped. */
	*ppos += count;
	return count;
}

/*
 * Top-level write handler: mirrors nvgrace_gpu_read() for the write
 * direction.
 */
static ssize_t
nvgrace_gpu_write(struct vfio_device *core_vdev,
		  const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	int ret;

	if (nvgrace_gpu_memregion(index, nvdev)) {
		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
			return -EIO;
		ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
		pm_runtime_put(&vdev->pdev->dev);
		return ret;
	}

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

/*
 * Provide the physical ranges backing a dma-buf export of the fake BAR
 * regions; other regions fall back to the vfio-pci core implementation.
 */
static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev,
				   struct p2pdma_provider **provider,
				   unsigned int region_index,
				   struct dma_buf_phys_vec *phys_vec,
				   struct vfio_region_dma_range *dma_ranges,
				   size_t nr_ranges)
{
	struct nvgrace_gpu_pci_core_device *nvdev = container_of(
		core_vdev, struct nvgrace_gpu_pci_core_device, core_device);
	struct pci_dev *pdev = core_vdev->pdev;
	struct mem_region *mem_region;

	/*
	 * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) {
	 *	The P2P properties of the non-BAR memory is the same as the
	 *	BAR memory, so just use the provider for index 0. Someday
	 *	when CXL gets P2P support we could create CXLish providers
	 *	for the non-BAR memory.
	 * } else if (region_index == USEMEM_REGION_INDEX) {
	 *	This is actually cachable memory and isn't treated as P2P in
	 *	the chip. For now we have no way to push cachable memory
	 *	through everything and the Grace HW doesn't care what caching
	 *	attribute is programmed into the SMMU. So use BAR 0.
	 * }
	 */
	mem_region = nvgrace_gpu_memregion(region_index, nvdev);
	if (mem_region) {
		*provider = pcim_p2pdma_provider(pdev, 0);
		if (!*provider)
			return -EINVAL;
		return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges,
						   nr_ranges,
						   mem_region->memphys,
						   mem_region->memlength);
	}

	return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index,
					     phys_vec, dma_ranges, nr_ranges);
}

static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = {
	.get_dmabuf_phys = nvgrace_get_dmabuf_phys,
};

/* Ops used when the device memory properties were successfully fetched. */
static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
	.name = "nvgrace-gpu-vfio-pci",
	.init = vfio_pci_core_init_dev,
	.release = vfio_pci_core_release_dev,
	.open_device = nvgrace_gpu_open_device,
	.close_device = nvgrace_gpu_close_device,
	.ioctl = nvgrace_gpu_ioctl,
	.get_region_info_caps = nvgrace_gpu_ioctl_get_region_info,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = nvgrace_gpu_read,
	.write = nvgrace_gpu_write,
	.mmap = nvgrace_gpu_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.match_token_uuid = vfio_pci_core_match_token_uuid,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = {
	.get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
};

/* Fallback ops when the device memory properties are unavailable. */
static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
	.name = "nvgrace-gpu-vfio-pci-core",
	.init = vfio_pci_core_init_dev,
	.release = vfio_pci_core_release_dev,
	.open_device = nvgrace_gpu_open_device,
	.close_device = vfio_pci_core_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.match_token_uuid = vfio_pci_core_match_token_uuid,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

/*
 * Fetch the device memory base and size from the ACPI-provided DSD
 * properties, validating that they fit the native types and that the
 * reported size is non-zero.
 */
static int
nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
				  u64 *pmemphys, u64 *pmemlength)
{
	int ret;

	/*
	 * The memory information is present in the system ACPI tables as DSD
	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
	 */
	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
				       pmemphys);
	if (ret)
		return ret;

	/* Guard against truncation on 32-bit phys_addr_t configurations. */
	if (*pmemphys > type_max(phys_addr_t))
		return -EOVERFLOW;

	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
				       pmemlength);
	if (ret)
		return ret;

	if (*pmemlength > type_max(size_t))
		return -EOVERFLOW;

	/*
	 * If the C2C link is not up due to an error, the coherent device
	 * memory size is returned as 0. Fail in such case.
	 */
	if (*pmemlength == 0)
		return -ENOMEM;

	return ret;
}

/*
 * Partition the device memory into usemem (and, on hardware with the MIG
 * bug, resmem) and compute the power-of-2 fake BAR sizes.
 */
static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	int ret = 0;
	u64 resmem_size = 0;

	/*
	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
	 * region to support the MIG feature owing to a hardware bug. Since the
	 * device memory is mapped as NORMAL cached, carve out a region from the end
	 * with a different NORMAL_NC property (called as reserved memory and
	 * represented as resmem). This region then is exposed as a 64b BAR
	 * (region 2 and 3) to the VM, while exposing the rest (termed as usable
	 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
	 *
	 *               devmem (memlength)
	 * |-------------------------------------------------|
	 * |                                                 |
	 * usemem.memphys                              resmem.memphys
	 *
	 * This hardware bug is fixed on the Grace Blackwell platforms and the
	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
	 * Thus on systems with the hardware fix, there is no need to partition
	 * the GPU device memory and the entire memory is usable and mapped as
	 * NORMAL cached (i.e. resmem size is 0).
	 */
	if (nvdev->has_mig_hw_bug)
		resmem_size = SZ_1G;

	nvdev->usemem.memphys = memphys;

	/*
	 * The device memory exposed to the VM is added to the kernel by the
	 * VM driver module in chunks of memory block size. Note that only the
	 * usable memory (usemem) is added to the kernel for usage by the VM
	 * workloads.
	 */
	if (check_sub_overflow(memlength, resmem_size,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The usemem region is exposed as a 64B Bar composed of region 4 and 5.
	 * Calculate and save the BAR size for the region.
	 */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);

	/*
	 * If the hardware has the fix for MIG, there is no requirement
	 * for splitting the device memory to create RESMEM. The entire
	 * device memory is usable and will be USEMEM. Return here for
	 * such case.
	 */
	if (!nvdev->has_mig_hw_bug)
		goto done;

	/*
	 * When the device memory is split to workaround the MIG bug on
	 * Grace Hopper, the USEMEM part of the device memory has to be
	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
	 * GPU FW and VFIO driver. The VM device driver is also aware of it
	 * and make use of the value for its calculation to determine USEMEM
	 * size. Note that the device memory may not be 512M aligned.
	 */
	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
					     MEMBLK_SIZE);
	if (nvdev->usemem.memlength == 0) {
		ret = -EINVAL;
		goto done;
	}

	if ((check_add_overflow(nvdev->usemem.memphys,
				nvdev->usemem.memlength,
				&nvdev->resmem.memphys)) ||
	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
				&nvdev->resmem.memlength))) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
	 * for Grace Hopper. Calculate and save the BAR size for the region.
	 */
	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

/*
 * Detect whether the device carries the MIG hardware bug by checking the
 * NVIDIA DVSEC capability bitmap: the bug is considered absent only when
 * the capability exists and advertises MIG support with cached resmem.
 */
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
	int pcie_dvsec;
	u16 dvsec_ctrl16;

	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
					       GPU_CAP_DVSEC_REGISTER);

	if (pcie_dvsec) {
		pci_read_config_word(pdev,
				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
				     &dvsec_ctrl16);

		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
			return false;
	}

	return true;
}

/*
 * To reduce the system bootup time, the HBM training has
 * been moved out of the UEFI on the Grace-Blackwell systems.
 *
 * The onus of checking whether the HBM training has completed
 * thus falls on the module. The HBM training status can be
 * determined from a BAR0 register.
 *
 * Similarly, another BAR0 register exposes the status of the
 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
 *
 * Poll these register and check for 30s. If the HBM training is
 * not complete or if the C2C link is not ready, fail the probe.
 *
 * While the wait is not required on Grace Hopper systems, it
 * is beneficial to make the check to ensure the device is in an
 * expected state.
 *
 * Ensure that the BAR0 region is enabled before accessing the
 * registers.
 */
static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
{
	void __iomem *io;
	int ret;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* Request and map BAR 0 only (bit 0 of the BAR mask). */
	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
	if (ret)
		goto request_region_exit;

	io = pci_iomap(pdev, 0, 0);
	if (!io) {
		ret = -ENOMEM;
		goto iomap_exit;
	}

	ret = nvgrace_gpu_wait_device_ready(io);

	pci_iounmap(pdev, io);
iomap_exit:
	pci_release_selected_regions(pdev, 1 << 0);
request_region_exit:
	pci_disable_device(pdev);
	return ret;
}

static int nvgrace_gpu_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
	struct nvgrace_gpu_pci_core_device *nvdev;
	u64 memphys, memlength;
	int ret;

	/* Fail the probe if HBM training or the C2C link is not ready. */
	ret = nvgrace_gpu_probe_check_device_ready(pdev);
	if (ret)
		return ret;

	/*
	 * The device memory properties come from the host ACPI table.
	 * When they are present, use the ops that emulate the fake
	 * device memory BARs; otherwise fall back to the plain
	 * vfio-pci core ops.
	 */
	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
	if (!ret)
		ops = &nvgrace_gpu_pci_ops;

	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
				  &pdev->dev, ops);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

	if (ops == &nvgrace_gpu_pci_ops) {
		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

		/*
		 * Device memory properties are identified in the host ACPI
		 * table. Set the nvgrace_gpu_pci_core_device structure.
		 */
		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
						    memphys, memlength);
		if (ret)
			goto out_put_vdev;
		nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops;
	} else {
		nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops;
	}

	ret = vfio_pci_core_register_device(&nvdev->core_device);
	if (ret)
		goto out_put_vdev;

	return ret;

out_put_vdev:
	/* Drops the reference taken by vfio_alloc_device(). */
	vfio_put_device(&nvdev->core_device.vdev);
	return ret;
}

static void nvgrace_gpu_remove(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	vfio_pci_core_unregister_device(core_device);
	vfio_put_device(&core_device->vdev);
}

static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
	/* GH200 120GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
	/* GH200 480GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
	/* GH200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
	/* GB200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
	/* GB300 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x31C2) },
	{}
};

MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);

/*
 * The GPU reset is required to be serialized against the *first* mapping
 * faults and read/write accesses to prevent potential RAS events logging.
 *
 * First fault or access after a reset needs to poll device readiness,
 * flag that a reset has occurred. The readiness test is done by holding
 * the memory_lock read lock and we expect all vfio-pci initiated resets to
 * hold the memory_lock write lock to avoid races.
 * However, .reset_done
 * extends beyond the scope of vfio-pci initiated resets, therefore we
 * cannot assert this behavior and use lockdep_assert_held_write.
 */
static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_device, struct nvgrace_gpu_pci_core_device,
			     core_device);

	/* Consumed by the next mapping fault or read/write access. */
	nvdev->reset_done = true;
}

static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = {
	.reset_done = nvgrace_gpu_vfio_pci_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = nvgrace_gpu_vfio_pci_table,
	.probe = nvgrace_gpu_probe,
	.remove = nvgrace_gpu_remove,
	.err_handler = &nvgrace_gpu_vfio_pci_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(nvgrace_gpu_vfio_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");