// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>

/*
 * The device memory usable by the workloads running in the VM is cached
 * and exposed as a 64b device BAR (comprising the BAR4 and BAR5 regions)
 * to the VM and is represented as usemem.
 * Moreover, the VM GPU device driver needs a non-cacheable region to
 * support the MIG feature. This region is also exposed as a 64b BAR
 * (comprising the BAR2 and BAR3 regions) and represented as resmem.
 */
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

#define DVSEC_BITMAP_OFFSET 0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)

#define GPU_CAP_DVSEC_REGISTER 3

#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
#define STATUS_READY 0xFF

#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)

/*
 * The state of the two device memory regions - resmem and usemem - is
 * saved as struct mem_region.
 */
struct mem_region {
	phys_addr_t memphys;	/* Base physical address of the region */
	size_t memlength;	/* Region size */
	size_t bar_size;	/* Reported region BAR size */
	__le64 bar_val;		/* Emulated BAR offset registers */
	union {
		void *memaddr;
		void __iomem *ioaddr;
	};			/* Base virtual address of the region */
};

struct nvgrace_gpu_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* Cached and usable memory for the VM. */
	struct mem_region usemem;
	/* Non-cached memory carved out from the end of device memory */
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
	bool has_mig_hw_bug;
};

static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	nvdev->resmem.bar_val = 0;
	nvdev->usemem.bar_val = 0;
}

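/*
 * vfio-pci encodes the region index in the upper bits of the device file
 * offset (VFIO_PCI_OFFSET_SHIFT). The handlers below recover the index with
 * VFIO_PCI_OFFSET_TO_INDEX() and the offset within the region by masking
 * with VFIO_PCI_OFFSET_MASK.
 */
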
/* Choose the structure corresponding to the fake BAR with a given index. */
static struct mem_region *
nvgrace_gpu_memregion(int index,
		      struct nvgrace_gpu_pci_core_device *nvdev)
{
	if (index == USEMEM_REGION_INDEX)
		return &nvdev->usemem;

	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
		return &nvdev->resmem;

	return NULL;
}

static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (nvdev->usemem.memlength) {
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		mutex_init(&nvdev->remap_lock);
	}

	vfio_pci_core_finish_enable(vdev);

	return 0;
}

static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	/* Unmap the mapping to the device memory cached region */
	if (nvdev->usemem.memaddr) {
		memunmap(nvdev->usemem.memaddr);
		nvdev->usemem.memaddr = NULL;
	}

	/* Unmap the mapping to the device memory non-cached region */
	if (nvdev->resmem.ioaddr) {
		iounmap(nvdev->resmem.ioaddr);
		nvdev->resmem.ioaddr = NULL;
	}

	mutex_destroy(&nvdev->remap_lock);

	vfio_pci_core_close_device(core_vdev);
}

static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
			    struct vm_area_struct *vma)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct mem_region *memregion;
	unsigned long start_pfn;
	u64 req_len, pgoff, end;
	unsigned int index;
	int ret = 0;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return vfio_pci_core_mmap(core_vdev, vma);

	/*
	 * Request to mmap the BAR. Map to the CPU-accessible memory on the
	 * GPU using the memory information gathered from the system ACPI
	 * tables.
	 */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
	    check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
		return -EOVERFLOW;

	/*
	 * Check that the mapping request does not go beyond the available
	 * device memory size.
	 */
	if (end > memregion->memlength)
		return -EINVAL;

	/*
	 * The carved out region of the device memory needs the NORMAL_NC
	 * property. Communicate as such to the hypervisor.
	 */
	if (index == RESMEM_REGION_INDEX) {
		/*
		 * The nvgrace-gpu module has no issues with uncontained
		 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
		 * set to tell KVM to stage-2 map the region as NORMAL_NC.
		 * This opens up guest usage of NORMAL_NC for this mapping.
		 */
		vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);

		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
	}

	/*
	 * Perform a PFN map to the memory and back the device BAR by the
	 * GPU memory.
	 *
	 * The available GPU memory size may not be power-of-2 aligned. The
	 * remainder is only backed by vfio_device_ops read/write handlers.
	 *
	 * During device reset, the GPU is safely disconnected from the CPU
	 * and accesses to the BAR return immediately, preventing machine
	 * checks.
	 */
	ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
			      req_len, vma->vm_page_prot);
	if (ret)
		return ret;

	vma->vm_pgoff = start_pfn;

	return 0;
}

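/*
 * Region info reported to userspace for the fake BARs: the region size is
 * the power-of-2 bar_size, while the sparse mmap capability restricts the
 * mappable area to the actual device memory size (memlength). Accesses to
 * the tail between memlength and bar_size are serviced by the read/write
 * handlers instead of mmap.
 */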
static long
nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
				  unsigned long arg)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info_cap_sparse_mmap *sparse;
	struct vfio_region_info info;
	struct mem_region *memregion;
	u32 size;
	int ret;

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	/*
	 * Request to determine the BAR region information. Send the
	 * GPU memory information.
	 */
	memregion = nvgrace_gpu_memregion(info.index, nvdev);
	if (!memregion)
		return vfio_pci_core_ioctl(core_vdev,
					   VFIO_DEVICE_GET_REGION_INFO, arg);

	size = struct_size(sparse, areas, 1);

	/*
	 * Set up a sparse mapping for the device memory. Only the
	 * available device memory on the hardware is shown as a
	 * mappable region.
	 */
	sparse = kzalloc(size, GFP_KERNEL);
	if (!sparse)
		return -ENOMEM;

	sparse->nr_areas = 1;
	sparse->areas[0].offset = 0;
	sparse->areas[0].size = memregion->memlength;
	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
	sparse->header.version = 1;

	ret = vfio_info_add_capability(&caps, &sparse->header, size);
	kfree(sparse);
	if (ret)
		return ret;

	info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
	/*
	 * The region memory size may not be power-of-2 aligned. Since the
	 * memory is exposed as a BAR, whose size must be a power of 2,
	 * report the size rounded up to the next power-of-2.
	 */
	info.size = memregion->bar_size;
	info.flags = VFIO_REGION_INFO_FLAG_READ |
		     VFIO_REGION_INFO_FLAG_WRITE |
		     VFIO_REGION_INFO_FLAG_MMAP;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg +
					 sizeof(info), caps.buf,
					 caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(info);
		}
		kfree(caps.buf);
	}
	return copy_to_user((void __user *)arg, &info, minsz) ?
			    -EFAULT : 0;
}

static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
			      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case VFIO_DEVICE_GET_REGION_INFO:
		return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
	case VFIO_DEVICE_IOEVENTFD:
		return -ENOTTY;
	case VFIO_DEVICE_RESET:
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		fallthrough;
	default:
		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
	}
}

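/*
 * Mask the lower bits of the emulated BAR offset register so that a guest
 * sizing probe (write ~0, read back) observes the fake BAR size, and OR in
 * the memory type/prefetch flags. Illustrative example (hypothetical value):
 * with a 2G bar_size, a written value of 0xffffffffffffffff reads back as
 * 0xffffffff80000000 plus the flag bits.
 */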
static __le64
nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
{
	u64 tmp_val;

	tmp_val = le64_to_cpu(val64);
	tmp_val &= ~(bar_size - 1);
	tmp_val |= flags;

	return cpu_to_le64(tmp_val);
}

/*
 * Both the usable (usemem) and the reserved (resmem) device memory regions
 * are exposed as 64b fake device BARs in the VM. These fake BARs must
 * respond to the accesses on their respective PCI config space offsets.
 *
 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
 */
static ssize_t
nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
			    char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	__le64 val64;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;
	int ret;

	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
	if (ret < 0)
		return ret;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(val64),
						&copy_offset, &copy_count,
						&register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count,
						     PCI_BASE_ADDRESS_4,
						     sizeof(val64),
						     &copy_offset, &copy_count,
						     &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
						   PCI_BASE_ADDRESS_MEM_PREFETCH,
						   memregion->bar_val);
		if (copy_to_user(buf + copy_offset,
				 (void *)&val64 + register_offset, copy_count)) {
			/*
			 * The position has been incremented in
			 * vfio_pci_core_read. Reset the offset back to the
			 * starting position.
			 */
			*ppos -= count;
			return -EFAULT;
		}
	}

	return count;
}

static ssize_t
nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
			     const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(u64), &copy_offset,
						&copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
						     sizeof(u64), &copy_offset,
						     &copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		if (copy_from_user((void *)&memregion->bar_val + register_offset,
				   buf + copy_offset, copy_count))
			return -EFAULT;
		*ppos += copy_count;
		return copy_count;
	}

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

/*
 * Map the device memory into the kernel VA space on demand. This is needed
 * because vfio does not require the userspace driver to access the vfio-pci
 * BAR regions only through mmap; such accesses must also be supported
 * through the vfio_device_ops read/write implementations.
 *
 * The usemem region is cacheable memory and hence is memremapped.
 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
 */
static int
nvgrace_gpu_map_device_mem(int index,
			   struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct mem_region *memregion;
	int ret = 0;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return -EINVAL;

	mutex_lock(&nvdev->remap_lock);

	if (memregion->memaddr)
		goto unlock;

	if (index == USEMEM_REGION_INDEX)
		memregion->memaddr = memremap(memregion->memphys,
					      memregion->memlength,
					      MEMREMAP_WB);
	else
		memregion->ioaddr = ioremap_wc(memregion->memphys,
					       memregion->memlength);

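	/*
	 * memaddr and ioaddr share storage in the mem_region union, so the
	 * single NULL check below covers both mapping paths.
	 */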
	if (!memregion->memaddr)
		ret = -ENOMEM;

unlock:
	mutex_unlock(&nvdev->remap_lock);

	return ret;
}

/*
 * Read the data from the device memory (mapped either through ioremap
 * or memremap) into the user buffer.
 */
static int
nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
			 char __user *buf, size_t mem_count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	/*
	 * Handle read on the BAR regions. Map to the target device memory
	 * physical address and copy to the requested read buffer.
	 */
	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_to_user(buf,
				 (u8 *)nvdev->usemem.memaddr + offset,
				 mem_count))
			ret = -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It synthesizes ~0 on such reads. So there is
		 * no need to check or support the disablement/enablement of
		 * BAR through the PCI_COMMAND config space register. Pass
		 * the test_mem flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     buf, offset, mem_count,
					     0, 0, false);
	}

	return ret;
}

/*
 * Read count bytes from the device memory at an offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes
 * the size to a power-of-2 (reported) when exposing it to a user space driver.
 *
 * Reads starting beyond the reported size generate -EINVAL; reads extending
 * beyond the actual device memory size are filled with ~0; reads extending
 * beyond the reported size are truncated.
 */
static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		     char __user *buf, size_t count, loff_t *ppos)
{
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct mem_region *memregion;
	size_t mem_count, i;
	u8 val = 0xFF;
	int ret;

	/* No need to do a NULL check as the caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip the read request to the reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes are actually read from the device memory.
	 * Read requests beyond the actual device memory size are filled with
	 * ~0, while those beyond the reported size are skipped.
	 */
	if (offset >= memregion->memlength)
		mem_count = 0;
	else
		mem_count = min(count, memregion->memlength - (size_t)offset);

	ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
	if (ret)
		return ret;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. A read to an offset beyond the device
	 * memory size is filled with ~0.
	 */
	for (i = mem_count; i < count; i++) {
		ret = put_user(val, (unsigned char __user *)(buf + i));
		if (ret)
			return ret;
	}

	*ppos += count;
	return count;
}

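/*
 * Top level read handler. Route accesses to the fake device memory BARs to
 * nvgrace_gpu_read_mem(), config space accesses to the BAR emulation and
 * everything else to the vfio-pci core.
 */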
static ssize_t
nvgrace_gpu_read(struct vfio_device *core_vdev,
		 char __user *buf, size_t count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	if (nvgrace_gpu_memregion(index, nvdev))
		return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_read(core_vdev, buf, count, ppos);
}

/*
 * Write the data to the device memory (mapped either through ioremap
 * or memremap) from the user buffer.
 */
static int
nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
			  const char __user *buf, size_t mem_count,
			  loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
				   buf, mem_count))
			return -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It drops such writes. So there is no need to
		 * check or support the disablement/enablement of BAR
		 * through the PCI_COMMAND config space register. Pass the
		 * test_mem flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     (char __user *)buf, pos, mem_count,
					     0, 0, true);
	}

	return ret;
}

/*
 * Write count bytes to the device memory at a given offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes the
 * size to a power-of-2 (reported) when exposing it to a user space driver.
 *
 * Writes extending beyond the reported size are truncated; writes starting
 * beyond the reported size generate -EINVAL.
 */
static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		      size_t count, loff_t *ppos, const char __user *buf)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion;
	size_t mem_count;
	int ret = 0;

	/* No need to do a NULL check as the caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip the write request to the reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes are actually written to the device memory.
	 * Do not write to offsets beyond the available size.
	 */
	if (offset >= memregion->memlength)
		goto exitfn;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. Drop accesses outside the available
	 * device memory on the hardware.
	 */
	mem_count = min(count, memregion->memlength - (size_t)offset);

	ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
	if (ret)
		return ret;

exitfn:
	*ppos += count;
	return count;
}

static ssize_t
nvgrace_gpu_write(struct vfio_device *core_vdev,
		  const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

	if (nvgrace_gpu_memregion(index, nvdev))
		return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

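/*
 * Device ops used when the GPU memory information was found in the ACPI
 * tables and the fake device memory BARs are emulated. nvgrace_gpu_pci_core_ops
 * below falls back to the vfio-pci core handlers for devices where that
 * information is not available.
 */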
static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
	.name = "nvgrace-gpu-vfio-pci",
	.init = vfio_pci_core_init_dev,
	.release = vfio_pci_core_release_dev,
	.open_device = nvgrace_gpu_open_device,
	.close_device = nvgrace_gpu_close_device,
	.ioctl = nvgrace_gpu_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = nvgrace_gpu_read,
	.write = nvgrace_gpu_write,
	.mmap = nvgrace_gpu_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
	.name = "nvgrace-gpu-vfio-pci-core",
	.init = vfio_pci_core_init_dev,
	.release = vfio_pci_core_release_dev,
	.open_device = nvgrace_gpu_open_device,
	.close_device = vfio_pci_core_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int
nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
				  u64 *pmemphys, u64 *pmemlength)
{
	int ret;

	/*
	 * The memory information is present in the system ACPI tables as DSD
	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
	 */
	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
				       pmemphys);
	if (ret)
		return ret;

	if (*pmemphys > type_max(phys_addr_t))
		return -EOVERFLOW;

	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
				       pmemlength);
	if (ret)
		return ret;

	if (*pmemlength > type_max(size_t))
		return -EOVERFLOW;

	/*
	 * If the C2C link is not up due to an error, the coherent device
	 * memory size is returned as 0. Fail in such case.
	 */
	if (*pmemlength == 0)
		return -ENOMEM;

	return ret;
}

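/*
 * Illustrative example of the partitioning done below (hypothetical numbers):
 * with the MIG hardware bug present and memlength = 98604 MB,
 * usemem.memlength = round_down(98604 MB - 1 GB, 512 MB) = 97280 MB and
 * resmem.memlength = 98604 MB - 97280 MB = 1324 MB. The reported BAR sizes
 * are then rounded up to the next power of 2: 128 GB and 2 GB respectively.
 */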
static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	int ret = 0;
	u64 resmem_size = 0;

	/*
	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
	 * region to support the MIG feature owing to a hardware bug. Since the
	 * device memory is mapped as NORMAL cached, carve out a region from the end
	 * with a different NORMAL_NC property (called reserved memory and
	 * represented as resmem). This region is then exposed as a 64b BAR
	 * (region 2 and 3) to the VM, while the rest (termed usable memory and
	 * represented using usemem) is exposed as a cacheable 64b BAR (region 4
	 * and 5).
	 *
	 *               devmem (memlength)
	 * |-------------------------------------------------|
	 * |                                         |
	 * usemem.memphys                            resmem.memphys
	 *
	 * This hardware bug is fixed on the Grace Blackwell platforms and the
	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
	 * Thus on systems with the hardware fix, there is no need to partition
	 * the GPU device memory and the entire memory is usable and mapped as
	 * NORMAL cached (i.e. resmem size is 0).
	 */
	if (nvdev->has_mig_hw_bug)
		resmem_size = SZ_1G;

	nvdev->usemem.memphys = memphys;

	/*
	 * The device memory exposed to the VM is added to the kernel by the
	 * VM driver module in chunks of memory block size. Note that only the
	 * usable memory (usemem) is added to the kernel for usage by the VM
	 * workloads.
	 */
	if (check_sub_overflow(memlength, resmem_size,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The usemem region is exposed as a 64b BAR composed of region 4 and 5.
	 * Calculate and save the BAR size for the region.
	 */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);

	/*
	 * If the hardware has the fix for MIG, there is no requirement
	 * for splitting the device memory to create RESMEM. The entire
	 * device memory is usable and will be USEMEM. Return here for
	 * such a case.
	 */
	if (!nvdev->has_mig_hw_bug)
		goto done;

	/*
	 * When the device memory is split to work around the MIG bug on
	 * Grace Hopper, the USEMEM part of the device memory has to be
	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
	 * GPU FW and VFIO driver. The VM device driver is also aware of it
	 * and makes use of the value for its calculation to determine USEMEM
	 * size. Note that the device memory may not be 512M aligned.
	 */
	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
					     MEMBLK_SIZE);
	if (nvdev->usemem.memlength == 0) {
		ret = -EINVAL;
		goto done;
	}

	if ((check_add_overflow(nvdev->usemem.memphys,
				nvdev->usemem.memlength,
				&nvdev->resmem.memphys)) ||
	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
				&nvdev->resmem.memlength))) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
	 * for Grace Hopper. Calculate and save the BAR size for the region.
	 */
	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

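/*
 * The MIG hardware bug is considered fixed when the NVIDIA DVSEC capability
 * is present and advertises MIG support with cached resmem. Absence of the
 * capability or of the bit means the resmem carve-out above is required.
 */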
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
	int pcie_dvsec;
	u16 dvsec_ctrl16;

	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
					       GPU_CAP_DVSEC_REGISTER);

	if (pcie_dvsec) {
		pci_read_config_word(pdev,
				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
				     &dvsec_ctrl16);

		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
			return false;
	}

	return true;
}

/*
 * To reduce the system bootup time, the HBM training has
 * been moved out of the UEFI on the Grace-Blackwell systems.
 *
 * The onus of checking whether the HBM training has completed
 * thus falls on the module. The HBM training status can be
 * determined from a BAR0 register.
 *
 * Similarly, another BAR0 register exposes the status of the
 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
 *
 * Poll these registers for up to 30s. If the HBM training is
 * not complete or if the C2C link is not ready, fail the probe.
 *
 * While the wait is not required on Grace Hopper systems, it
 * is beneficial to make the check to ensure the device is in an
 * expected state.
 *
 * Ensure that the BAR0 region is enabled before accessing the
 * registers.
 */
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
	void __iomem *io;
	int ret = -ETIME;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
	if (ret)
		goto request_region_exit;

	io = pci_iomap(pdev, 0, 0);
	if (!io) {
		ret = -ENOMEM;
		goto iomap_exit;
	}

	do {
		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
			ret = 0;
			goto reg_check_exit;
		}
		msleep(POLL_QUANTUM_MS);
	} while (!time_after(jiffies, timeout));

	/* Neither status became ready within the timeout; fail the probe. */
	ret = -ETIME;

reg_check_exit:
	pci_iounmap(pdev, io);
iomap_exit:
	pci_release_selected_regions(pdev, 1 << 0);
request_region_exit:
	pci_disable_device(pdev);
	return ret;
}

static int nvgrace_gpu_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
	struct nvgrace_gpu_pci_core_device *nvdev;
	u64 memphys, memlength;
	int ret;

	ret = nvgrace_gpu_wait_device_ready(pdev);
	if (ret)
		return ret;

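	/*
	 * If the ACPI device memory properties are missing, fall back to the
	 * plain vfio-pci core ops and expose the device as a regular
	 * passthrough device without the fake device memory BARs.
	 */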
	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
	if (!ret)
		ops = &nvgrace_gpu_pci_ops;

	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
				  &pdev->dev, ops);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

	if (ops == &nvgrace_gpu_pci_ops) {
		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

		/*
		 * Device memory properties are identified in the host ACPI
		 * table. Set the nvgrace_gpu_pci_core_device structure.
		 */
		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
						    memphys, memlength);
		if (ret)
			goto out_put_vdev;
	}

	ret = vfio_pci_core_register_device(&nvdev->core_device);
	if (ret)
		goto out_put_vdev;

	return ret;

out_put_vdev:
	vfio_put_device(&nvdev->core_device.vdev);
	return ret;
}

static void nvgrace_gpu_remove(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	vfio_pci_core_unregister_device(core_device);
	vfio_put_device(&core_device->vdev);
}

static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
	/* GH200 120GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
	/* GH200 480GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
	/* GH200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
	/* GB200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
	{}
};

MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);

static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = nvgrace_gpu_vfio_pci_table,
	.probe = nvgrace_gpu_probe,
	.remove = nvgrace_gpu_remove,
	.err_handler = &vfio_pci_core_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(nvgrace_gpu_vfio_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");