1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include <linux/sizes.h> 7 #include <linux/vfio_pci_core.h> 8 #include <linux/delay.h> 9 #include <linux/jiffies.h> 10 11 /* 12 * The device memory usable to the workloads running in the VM is cached 13 * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) 14 * to the VM and is represented as usemem. 15 * Moreover, the VM GPU device driver needs a non-cacheable region to 16 * support the MIG feature. This region is also exposed as a 64b BAR 17 * (comprising of BAR2 and BAR3 region) and represented as resmem. 18 */ 19 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX 20 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX 21 22 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ 23 #define MEMBLK_SIZE SZ_512M 24 25 #define DVSEC_BITMAP_OFFSET 0xA 26 #define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) 27 28 #define GPU_CAP_DVSEC_REGISTER 3 29 30 #define C2C_LINK_BAR0_OFFSET 0x1498 31 #define HBM_TRAINING_BAR0_OFFSET 0x200BC 32 #define STATUS_READY 0xFF 33 34 #define POLL_QUANTUM_MS 1000 35 #define POLL_TIMEOUT_MS (30 * 1000) 36 37 /* 38 * The state of the two device memory region - resmem and usemem - is 39 * saved as struct mem_region. 40 */ 41 struct mem_region { 42 phys_addr_t memphys; /* Base physical address of the region */ 43 size_t memlength; /* Region size */ 44 size_t bar_size; /* Reported region BAR size */ 45 __le64 bar_val; /* Emulated BAR offset registers */ 46 union { 47 void *memaddr; 48 void __iomem *ioaddr; 49 }; /* Base virtual address of the region */ 50 }; 51 52 struct nvgrace_gpu_pci_core_device { 53 struct vfio_pci_core_device core_device; 54 /* Cached and usable memory for the VM. */ 55 struct mem_region usemem; 56 /* Non cached memory carved out from the end of device memory */ 57 struct mem_region resmem; 58 /* Lock to control device memory kernel mapping */ 59 struct mutex remap_lock; 60 bool has_mig_hw_bug; 61 }; 62 63 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) 64 { 65 struct nvgrace_gpu_pci_core_device *nvdev = 66 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 67 core_device.vdev); 68 69 nvdev->resmem.bar_val = 0; 70 nvdev->usemem.bar_val = 0; 71 } 72 73 /* Choose the structure corresponding to the fake BAR with a given index. */ 74 static struct mem_region * 75 nvgrace_gpu_memregion(int index, 76 struct nvgrace_gpu_pci_core_device *nvdev) 77 { 78 if (index == USEMEM_REGION_INDEX) 79 return &nvdev->usemem; 80 81 if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) 82 return &nvdev->resmem; 83 84 return NULL; 85 } 86 87 static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) 88 { 89 struct vfio_pci_core_device *vdev = 90 container_of(core_vdev, struct vfio_pci_core_device, vdev); 91 struct nvgrace_gpu_pci_core_device *nvdev = 92 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 93 core_device.vdev); 94 int ret; 95 96 ret = vfio_pci_core_enable(vdev); 97 if (ret) 98 return ret; 99 100 if (nvdev->usemem.memlength) { 101 nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 102 mutex_init(&nvdev->remap_lock); 103 } 104 105 vfio_pci_core_finish_enable(vdev); 106 107 return 0; 108 } 109 110 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) 111 { 112 struct nvgrace_gpu_pci_core_device *nvdev = 113 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 114 core_device.vdev); 115 116 /* Unmap the mapping to the device memory cached region */ 117 if (nvdev->usemem.memaddr) { 118 memunmap(nvdev->usemem.memaddr); 119 nvdev->usemem.memaddr = NULL; 120 } 121 122 /* Unmap the mapping to the device memory non-cached region */ 123 if (nvdev->resmem.ioaddr) { 124 iounmap(nvdev->resmem.ioaddr); 125 nvdev->resmem.ioaddr = NULL; 126 } 127 128 mutex_destroy(&nvdev->remap_lock); 129 130 vfio_pci_core_close_device(core_vdev); 131 } 132 133 static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, 134 struct vm_area_struct *vma) 135 { 136 struct nvgrace_gpu_pci_core_device *nvdev = 137 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 138 core_device.vdev); 139 struct mem_region *memregion; 140 unsigned long start_pfn; 141 u64 req_len, pgoff, end; 142 unsigned int index; 143 int ret = 0; 144 145 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 146 147 memregion = nvgrace_gpu_memregion(index, nvdev); 148 if (!memregion) 149 return vfio_pci_core_mmap(core_vdev, vma); 150 151 /* 152 * Request to mmap the BAR. Map to the CPU accessible memory on the 153 * GPU using the memory information gathered from the system ACPI 154 * tables. 155 */ 156 pgoff = vma->vm_pgoff & 157 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 158 159 if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || 160 check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || 161 check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) 162 return -EOVERFLOW; 163 164 /* 165 * Check that the mapping request does not go beyond available device 166 * memory size 167 */ 168 if (end > memregion->memlength) 169 return -EINVAL; 170 171 /* 172 * The carved out region of the device memory needs the NORMAL_NC 173 * property. Communicate as such to the hypervisor. 174 */ 175 if (index == RESMEM_REGION_INDEX) { 176 /* 177 * The nvgrace-gpu module has no issues with uncontained 178 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is 179 * set to communicate to the KVM to S2 map as NORMAL_NC. 180 * This opens up guest usage of NORMAL_NC for this mapping. 181 */ 182 vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); 183 184 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 185 } 186 187 /* 188 * Perform a PFN map to the memory and back the device BAR by the 189 * GPU memory. 190 * 191 * The available GPU memory size may not be power-of-2 aligned. The 192 * remainder is only backed by vfio_device_ops read/write handlers. 193 * 194 * During device reset, the GPU is safely disconnected to the CPU 195 * and access to the BAR will be immediately returned preventing 196 * machine check. 197 */ 198 ret = remap_pfn_range(vma, vma->vm_start, start_pfn, 199 req_len, vma->vm_page_prot); 200 if (ret) 201 return ret; 202 203 vma->vm_pgoff = start_pfn; 204 205 return 0; 206 } 207 208 static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, 209 struct vfio_region_info *info, 210 struct vfio_info_cap *caps) 211 { 212 struct nvgrace_gpu_pci_core_device *nvdev = 213 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 214 core_device.vdev); 215 struct vfio_region_info_cap_sparse_mmap *sparse; 216 struct mem_region *memregion; 217 u32 size; 218 int ret; 219 220 /* 221 * Request to determine the BAR region information. Send the 222 * GPU memory information. 223 */ 224 memregion = nvgrace_gpu_memregion(info->index, nvdev); 225 if (!memregion) 226 return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); 227 228 size = struct_size(sparse, areas, 1); 229 230 /* 231 * Setup for sparse mapping for the device memory. Only the 232 * available device memory on the hardware is shown as a 233 * mappable region. 234 */ 235 sparse = kzalloc(size, GFP_KERNEL); 236 if (!sparse) 237 return -ENOMEM; 238 239 sparse->nr_areas = 1; 240 sparse->areas[0].offset = 0; 241 sparse->areas[0].size = memregion->memlength; 242 sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; 243 sparse->header.version = 1; 244 245 ret = vfio_info_add_capability(caps, &sparse->header, size); 246 kfree(sparse); 247 if (ret) 248 return ret; 249 250 info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); 251 /* 252 * The region memory size may not be power-of-2 aligned. 253 * Given that the memory is a BAR and may not be 254 * aligned, roundup to the next power-of-2. 255 */ 256 info->size = memregion->bar_size; 257 info->flags = VFIO_REGION_INFO_FLAG_READ | 258 VFIO_REGION_INFO_FLAG_WRITE | 259 VFIO_REGION_INFO_FLAG_MMAP; 260 return 0; 261 } 262 263 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, 264 unsigned int cmd, unsigned long arg) 265 { 266 switch (cmd) { 267 case VFIO_DEVICE_IOEVENTFD: 268 return -ENOTTY; 269 case VFIO_DEVICE_RESET: 270 nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 271 fallthrough; 272 default: 273 return vfio_pci_core_ioctl(core_vdev, cmd, arg); 274 } 275 } 276 277 static __le64 278 nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) 279 { 280 u64 tmp_val; 281 282 tmp_val = le64_to_cpu(val64); 283 tmp_val &= ~(bar_size - 1); 284 tmp_val |= flags; 285 286 return cpu_to_le64(tmp_val); 287 } 288 289 /* 290 * Both the usable (usemem) and the reserved (resmem) device memory region 291 * are exposed as a 64b fake device BARs in the VM. These fake BARs must 292 * respond to the accesses on their respective PCI config space offsets. 293 * 294 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. 295 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. 296 */ 297 static ssize_t 298 nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, 299 char __user *buf, size_t count, loff_t *ppos) 300 { 301 struct nvgrace_gpu_pci_core_device *nvdev = 302 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 303 core_device.vdev); 304 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 305 struct mem_region *memregion = NULL; 306 __le64 val64; 307 size_t register_offset; 308 loff_t copy_offset; 309 size_t copy_count; 310 int ret; 311 312 ret = vfio_pci_core_read(core_vdev, buf, count, ppos); 313 if (ret < 0) 314 return ret; 315 316 if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 317 sizeof(val64), 318 ©_offset, ©_count, 319 ®ister_offset)) 320 memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 321 else if (vfio_pci_core_range_intersect_range(pos, count, 322 PCI_BASE_ADDRESS_4, 323 sizeof(val64), 324 ©_offset, ©_count, 325 ®ister_offset)) 326 memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 327 328 if (memregion) { 329 val64 = nvgrace_gpu_get_read_value(memregion->bar_size, 330 PCI_BASE_ADDRESS_MEM_TYPE_64 | 331 PCI_BASE_ADDRESS_MEM_PREFETCH, 332 memregion->bar_val); 333 if (copy_to_user(buf + copy_offset, 334 (void *)&val64 + register_offset, copy_count)) { 335 /* 336 * The position has been incremented in 337 * vfio_pci_core_read. Reset the offset back to the 338 * starting position. 339 */ 340 *ppos -= count; 341 return -EFAULT; 342 } 343 } 344 345 return count; 346 } 347 348 static ssize_t 349 nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, 350 const char __user *buf, size_t count, loff_t *ppos) 351 { 352 struct nvgrace_gpu_pci_core_device *nvdev = 353 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 354 core_device.vdev); 355 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 356 struct mem_region *memregion = NULL; 357 size_t register_offset; 358 loff_t copy_offset; 359 size_t copy_count; 360 361 if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 362 sizeof(u64), ©_offset, 363 ©_count, ®ister_offset)) 364 memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 365 else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, 366 sizeof(u64), ©_offset, 367 ©_count, ®ister_offset)) 368 memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 369 370 if (memregion) { 371 if (copy_from_user((void *)&memregion->bar_val + register_offset, 372 buf + copy_offset, copy_count)) 373 return -EFAULT; 374 *ppos += copy_count; 375 return copy_count; 376 } 377 378 return vfio_pci_core_write(core_vdev, buf, count, ppos); 379 } 380 381 /* 382 * Ad hoc map the device memory in the module kernel VA space. Primarily needed 383 * as vfio does not require the userspace driver to only perform accesses through 384 * mmaps of the vfio-pci BAR regions and such accesses should be supported using 385 * vfio_device_ops read/write implementations. 386 * 387 * The usemem region is cacheable memory and hence is memremaped. 388 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). 389 */ 390 static int 391 nvgrace_gpu_map_device_mem(int index, 392 struct nvgrace_gpu_pci_core_device *nvdev) 393 { 394 struct mem_region *memregion; 395 int ret = 0; 396 397 memregion = nvgrace_gpu_memregion(index, nvdev); 398 if (!memregion) 399 return -EINVAL; 400 401 mutex_lock(&nvdev->remap_lock); 402 403 if (memregion->memaddr) 404 goto unlock; 405 406 if (index == USEMEM_REGION_INDEX) 407 memregion->memaddr = memremap(memregion->memphys, 408 memregion->memlength, 409 MEMREMAP_WB); 410 else 411 memregion->ioaddr = ioremap_wc(memregion->memphys, 412 memregion->memlength); 413 414 if (!memregion->memaddr) 415 ret = -ENOMEM; 416 417 unlock: 418 mutex_unlock(&nvdev->remap_lock); 419 420 return ret; 421 } 422 423 /* 424 * Read the data from the device memory (mapped either through ioremap 425 * or memremap) into the user buffer. 426 */ 427 static int 428 nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, 429 char __user *buf, size_t mem_count, loff_t *ppos) 430 { 431 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 432 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 433 int ret; 434 435 if (!mem_count) 436 return 0; 437 438 /* 439 * Handle read on the BAR regions. Map to the target device memory 440 * physical address and copy to the request read buffer. 441 */ 442 ret = nvgrace_gpu_map_device_mem(index, nvdev); 443 if (ret) 444 return ret; 445 446 if (index == USEMEM_REGION_INDEX) { 447 if (copy_to_user(buf, 448 (u8 *)nvdev->usemem.memaddr + offset, 449 mem_count)) 450 ret = -EFAULT; 451 } else { 452 /* 453 * The hardware ensures that the system does not crash when 454 * the device memory is accessed with the memory enable 455 * turned off. It synthesizes ~0 on such read. So there is 456 * no need to check or support the disablement/enablement of 457 * BAR through PCI_COMMAND config space register. Pass 458 * test_mem flag as false. 459 */ 460 ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 461 nvdev->resmem.ioaddr, 462 buf, offset, mem_count, 463 0, 0, false); 464 } 465 466 return ret; 467 } 468 469 /* 470 * Read count bytes from the device memory at an offset. The actual device 471 * memory size (available) may not be a power-of-2. So the driver fakes 472 * the size to a power-of-2 (reported) when exposing to a user space driver. 473 * 474 * Reads starting beyond the reported size generate -EINVAL; reads extending 475 * beyond the actual device size is filled with ~0; reads extending beyond 476 * the reported size are truncated. 477 */ 478 static ssize_t 479 nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, 480 char __user *buf, size_t count, loff_t *ppos) 481 { 482 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 483 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 484 struct mem_region *memregion; 485 size_t mem_count, i; 486 u8 val = 0xFF; 487 int ret; 488 489 /* No need to do NULL check as caller does. */ 490 memregion = nvgrace_gpu_memregion(index, nvdev); 491 492 if (offset >= memregion->bar_size) 493 return -EINVAL; 494 495 /* Clip short the read request beyond reported BAR size */ 496 count = min(count, memregion->bar_size - (size_t)offset); 497 498 /* 499 * Determine how many bytes to be actually read from the device memory. 500 * Read request beyond the actual device memory size is filled with ~0, 501 * while those beyond the actual reported size is skipped. 502 */ 503 if (offset >= memregion->memlength) 504 mem_count = 0; 505 else 506 mem_count = min(count, memregion->memlength - (size_t)offset); 507 508 ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); 509 if (ret) 510 return ret; 511 512 /* 513 * Only the device memory present on the hardware is mapped, which may 514 * not be power-of-2 aligned. A read to an offset beyond the device memory 515 * size is filled with ~0. 516 */ 517 for (i = mem_count; i < count; i++) { 518 ret = put_user(val, (unsigned char __user *)(buf + i)); 519 if (ret) 520 return ret; 521 } 522 523 *ppos += count; 524 return count; 525 } 526 527 static ssize_t 528 nvgrace_gpu_read(struct vfio_device *core_vdev, 529 char __user *buf, size_t count, loff_t *ppos) 530 { 531 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 532 struct nvgrace_gpu_pci_core_device *nvdev = 533 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 534 core_device.vdev); 535 536 if (nvgrace_gpu_memregion(index, nvdev)) 537 return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); 538 539 if (index == VFIO_PCI_CONFIG_REGION_INDEX) 540 return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); 541 542 return vfio_pci_core_read(core_vdev, buf, count, ppos); 543 } 544 545 /* 546 * Write the data to the device memory (mapped either through ioremap 547 * or memremap) from the user buffer. 548 */ 549 static int 550 nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, 551 const char __user *buf, size_t mem_count, 552 loff_t *ppos) 553 { 554 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 555 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 556 int ret; 557 558 if (!mem_count) 559 return 0; 560 561 ret = nvgrace_gpu_map_device_mem(index, nvdev); 562 if (ret) 563 return ret; 564 565 if (index == USEMEM_REGION_INDEX) { 566 if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, 567 buf, mem_count)) 568 return -EFAULT; 569 } else { 570 /* 571 * The hardware ensures that the system does not crash when 572 * the device memory is accessed with the memory enable 573 * turned off. It drops such writes. So there is no need to 574 * check or support the disablement/enablement of BAR 575 * through PCI_COMMAND config space register. Pass test_mem 576 * flag as false. 577 */ 578 ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 579 nvdev->resmem.ioaddr, 580 (char __user *)buf, pos, mem_count, 581 0, 0, true); 582 } 583 584 return ret; 585 } 586 587 /* 588 * Write count bytes to the device memory at a given offset. The actual device 589 * memory size (available) may not be a power-of-2. So the driver fakes the 590 * size to a power-of-2 (reported) when exposing to a user space driver. 591 * 592 * Writes extending beyond the reported size are truncated; writes starting 593 * beyond the reported size generate -EINVAL. 594 */ 595 static ssize_t 596 nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, 597 size_t count, loff_t *ppos, const char __user *buf) 598 { 599 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 600 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 601 struct mem_region *memregion; 602 size_t mem_count; 603 int ret = 0; 604 605 /* No need to do NULL check as caller does. */ 606 memregion = nvgrace_gpu_memregion(index, nvdev); 607 608 if (offset >= memregion->bar_size) 609 return -EINVAL; 610 611 /* Clip short the write request beyond reported BAR size */ 612 count = min(count, memregion->bar_size - (size_t)offset); 613 614 /* 615 * Determine how many bytes to be actually written to the device memory. 616 * Do not write to the offset beyond available size. 617 */ 618 if (offset >= memregion->memlength) 619 goto exitfn; 620 621 /* 622 * Only the device memory present on the hardware is mapped, which may 623 * not be power-of-2 aligned. Drop access outside the available device 624 * memory on the hardware. 625 */ 626 mem_count = min(count, memregion->memlength - (size_t)offset); 627 628 ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); 629 if (ret) 630 return ret; 631 632 exitfn: 633 *ppos += count; 634 return count; 635 } 636 637 static ssize_t 638 nvgrace_gpu_write(struct vfio_device *core_vdev, 639 const char __user *buf, size_t count, loff_t *ppos) 640 { 641 struct nvgrace_gpu_pci_core_device *nvdev = 642 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 643 core_device.vdev); 644 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 645 646 if (nvgrace_gpu_memregion(index, nvdev)) 647 return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); 648 649 if (index == VFIO_PCI_CONFIG_REGION_INDEX) 650 return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); 651 652 return vfio_pci_core_write(core_vdev, buf, count, ppos); 653 } 654 655 static const struct vfio_device_ops nvgrace_gpu_pci_ops = { 656 .name = "nvgrace-gpu-vfio-pci", 657 .init = vfio_pci_core_init_dev, 658 .release = vfio_pci_core_release_dev, 659 .open_device = nvgrace_gpu_open_device, 660 .close_device = nvgrace_gpu_close_device, 661 .ioctl = nvgrace_gpu_ioctl, 662 .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, 663 .device_feature = vfio_pci_core_ioctl_feature, 664 .read = nvgrace_gpu_read, 665 .write = nvgrace_gpu_write, 666 .mmap = nvgrace_gpu_mmap, 667 .request = vfio_pci_core_request, 668 .match = vfio_pci_core_match, 669 .match_token_uuid = vfio_pci_core_match_token_uuid, 670 .bind_iommufd = vfio_iommufd_physical_bind, 671 .unbind_iommufd = vfio_iommufd_physical_unbind, 672 .attach_ioas = vfio_iommufd_physical_attach_ioas, 673 .detach_ioas = vfio_iommufd_physical_detach_ioas, 674 }; 675 676 static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { 677 .name = "nvgrace-gpu-vfio-pci-core", 678 .init = vfio_pci_core_init_dev, 679 .release = vfio_pci_core_release_dev, 680 .open_device = nvgrace_gpu_open_device, 681 .close_device = vfio_pci_core_close_device, 682 .ioctl = vfio_pci_core_ioctl, 683 .get_region_info_caps = vfio_pci_ioctl_get_region_info, 684 .device_feature = vfio_pci_core_ioctl_feature, 685 .read = vfio_pci_core_read, 686 .write = vfio_pci_core_write, 687 .mmap = vfio_pci_core_mmap, 688 .request = vfio_pci_core_request, 689 .match = vfio_pci_core_match, 690 .match_token_uuid = vfio_pci_core_match_token_uuid, 691 .bind_iommufd = vfio_iommufd_physical_bind, 692 .unbind_iommufd = vfio_iommufd_physical_unbind, 693 .attach_ioas = vfio_iommufd_physical_attach_ioas, 694 .detach_ioas = vfio_iommufd_physical_detach_ioas, 695 }; 696 697 static int 698 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, 699 u64 *pmemphys, u64 *pmemlength) 700 { 701 int ret; 702 703 /* 704 * The memory information is present in the system ACPI tables as DSD 705 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. 706 */ 707 ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", 708 pmemphys); 709 if (ret) 710 return ret; 711 712 if (*pmemphys > type_max(phys_addr_t)) 713 return -EOVERFLOW; 714 715 ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", 716 pmemlength); 717 if (ret) 718 return ret; 719 720 if (*pmemlength > type_max(size_t)) 721 return -EOVERFLOW; 722 723 /* 724 * If the C2C link is not up due to an error, the coherent device 725 * memory size is returned as 0. Fail in such case. 726 */ 727 if (*pmemlength == 0) 728 return -ENOMEM; 729 730 return ret; 731 } 732 733 static int 734 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, 735 struct nvgrace_gpu_pci_core_device *nvdev, 736 u64 memphys, u64 memlength) 737 { 738 int ret = 0; 739 u64 resmem_size = 0; 740 741 /* 742 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable 743 * region to support the MIG feature owing to a hardware bug. Since the 744 * device memory is mapped as NORMAL cached, carve out a region from the end 745 * with a different NORMAL_NC property (called as reserved memory and 746 * represented as resmem). This region then is exposed as a 64b BAR 747 * (region 2 and 3) to the VM, while exposing the rest (termed as usable 748 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). 749 * 750 * devmem (memlength) 751 * |-------------------------------------------------| 752 * | | 753 * usemem.memphys resmem.memphys 754 * 755 * This hardware bug is fixed on the Grace Blackwell platforms and the 756 * presence of the bug can be determined through nvdev->has_mig_hw_bug. 757 * Thus on systems with the hardware fix, there is no need to partition 758 * the GPU device memory and the entire memory is usable and mapped as 759 * NORMAL cached (i.e. resmem size is 0). 760 */ 761 if (nvdev->has_mig_hw_bug) 762 resmem_size = SZ_1G; 763 764 nvdev->usemem.memphys = memphys; 765 766 /* 767 * The device memory exposed to the VM is added to the kernel by the 768 * VM driver module in chunks of memory block size. Note that only the 769 * usable memory (usemem) is added to the kernel for usage by the VM 770 * workloads. 771 */ 772 if (check_sub_overflow(memlength, resmem_size, 773 &nvdev->usemem.memlength)) { 774 ret = -EOVERFLOW; 775 goto done; 776 } 777 778 /* 779 * The usemem region is exposed as a 64B Bar composed of region 4 and 5. 780 * Calculate and save the BAR size for the region. 781 */ 782 nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 783 784 /* 785 * If the hardware has the fix for MIG, there is no requirement 786 * for splitting the device memory to create RESMEM. The entire 787 * device memory is usable and will be USEMEM. Return here for 788 * such case. 789 */ 790 if (!nvdev->has_mig_hw_bug) 791 goto done; 792 793 /* 794 * When the device memory is split to workaround the MIG bug on 795 * Grace Hopper, the USEMEM part of the device memory has to be 796 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the 797 * GPU FW and VFIO driver. The VM device driver is also aware of it 798 * and make use of the value for its calculation to determine USEMEM 799 * size. Note that the device memory may not be 512M aligned. 800 */ 801 nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, 802 MEMBLK_SIZE); 803 if (nvdev->usemem.memlength == 0) { 804 ret = -EINVAL; 805 goto done; 806 } 807 808 if ((check_add_overflow(nvdev->usemem.memphys, 809 nvdev->usemem.memlength, 810 &nvdev->resmem.memphys)) || 811 (check_sub_overflow(memlength, nvdev->usemem.memlength, 812 &nvdev->resmem.memlength))) { 813 ret = -EOVERFLOW; 814 goto done; 815 } 816 817 /* 818 * The resmem region is exposed as a 64b BAR composed of region 2 and 3 819 * for Grace Hopper. Calculate and save the BAR size for the region. 820 */ 821 nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); 822 done: 823 return ret; 824 } 825 826 static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) 827 { 828 int pcie_dvsec; 829 u16 dvsec_ctrl16; 830 831 pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, 832 GPU_CAP_DVSEC_REGISTER); 833 834 if (pcie_dvsec) { 835 pci_read_config_word(pdev, 836 pcie_dvsec + DVSEC_BITMAP_OFFSET, 837 &dvsec_ctrl16); 838 839 if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) 840 return false; 841 } 842 843 return true; 844 } 845 846 /* 847 * To reduce the system bootup time, the HBM training has 848 * been moved out of the UEFI on the Grace-Blackwell systems. 849 * 850 * The onus of checking whether the HBM training has completed 851 * thus falls on the module. The HBM training status can be 852 * determined from a BAR0 register. 853 * 854 * Similarly, another BAR0 register exposes the status of the 855 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. 856 * 857 * Poll these register and check for 30s. If the HBM training is 858 * not complete or if the C2C link is not ready, fail the probe. 859 * 860 * While the wait is not required on Grace Hopper systems, it 861 * is beneficial to make the check to ensure the device is in an 862 * expected state. 863 * 864 * Ensure that the BAR0 region is enabled before accessing the 865 * registers. 866 */ 867 static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) 868 { 869 unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); 870 void __iomem *io; 871 int ret = -ETIME; 872 873 ret = pci_enable_device(pdev); 874 if (ret) 875 return ret; 876 877 ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); 878 if (ret) 879 goto request_region_exit; 880 881 io = pci_iomap(pdev, 0, 0); 882 if (!io) { 883 ret = -ENOMEM; 884 goto iomap_exit; 885 } 886 887 do { 888 if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && 889 (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { 890 ret = 0; 891 goto reg_check_exit; 892 } 893 msleep(POLL_QUANTUM_MS); 894 } while (!time_after(jiffies, timeout)); 895 896 reg_check_exit: 897 pci_iounmap(pdev, io); 898 iomap_exit: 899 pci_release_selected_regions(pdev, 1 << 0); 900 request_region_exit: 901 pci_disable_device(pdev); 902 return ret; 903 } 904 905 static int nvgrace_gpu_probe(struct pci_dev *pdev, 906 const struct pci_device_id *id) 907 { 908 const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; 909 struct nvgrace_gpu_pci_core_device *nvdev; 910 u64 memphys, memlength; 911 int ret; 912 913 ret = nvgrace_gpu_wait_device_ready(pdev); 914 if (ret) 915 return ret; 916 917 ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); 918 if (!ret) 919 ops = &nvgrace_gpu_pci_ops; 920 921 nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, 922 &pdev->dev, ops); 923 if (IS_ERR(nvdev)) 924 return PTR_ERR(nvdev); 925 926 dev_set_drvdata(&pdev->dev, &nvdev->core_device); 927 928 if (ops == &nvgrace_gpu_pci_ops) { 929 nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); 930 931 /* 932 * Device memory properties are identified in the host ACPI 933 * table. Set the nvgrace_gpu_pci_core_device structure. 934 */ 935 ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, 936 memphys, memlength); 937 if (ret) 938 goto out_put_vdev; 939 } 940 941 ret = vfio_pci_core_register_device(&nvdev->core_device); 942 if (ret) 943 goto out_put_vdev; 944 945 return ret; 946 947 out_put_vdev: 948 vfio_put_device(&nvdev->core_device.vdev); 949 return ret; 950 } 951 952 static void nvgrace_gpu_remove(struct pci_dev *pdev) 953 { 954 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 955 956 vfio_pci_core_unregister_device(core_device); 957 vfio_put_device(&core_device->vdev); 958 } 959 960 static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { 961 /* GH200 120GB */ 962 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, 963 /* GH200 480GB */ 964 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, 965 /* GH200 SKU */ 966 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, 967 /* GB200 SKU */ 968 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, 969 /* GB300 SKU */ 970 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x31C2) }, 971 {} 972 }; 973 974 MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); 975 976 static struct pci_driver nvgrace_gpu_vfio_pci_driver = { 977 .name = KBUILD_MODNAME, 978 .id_table = nvgrace_gpu_vfio_pci_table, 979 .probe = nvgrace_gpu_probe, 980 .remove = nvgrace_gpu_remove, 981 .err_handler = &vfio_pci_core_err_handlers, 982 .driver_managed_dma = true, 983 }; 984 985 module_pci_driver(nvgrace_gpu_vfio_pci_driver); 986 987 MODULE_LICENSE("GPL"); 988 MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>"); 989 MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>"); 990 MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory"); 991