// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>

/*
 * The device memory usable to the workloads running in the VM is cached
 * and exposed as a 64b device BAR (comprising the BAR4 and BAR5 regions)
 * to the VM and is represented as usemem.
 * Moreover, the VM GPU device driver needs a non-cacheable region to
 * support the MIG feature. This region is also exposed as a 64b BAR
 * (comprising the BAR2 and BAR3 regions) and represented as resmem.
 */
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

#define DVSEC_BITMAP_OFFSET 0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)

#define GPU_CAP_DVSEC_REGISTER 3

#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
#define STATUS_READY 0xFF

#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)

/*
 * The state of the two device memory regions - resmem and usemem - is
 * saved as struct mem_region.
 */
struct mem_region {
        phys_addr_t memphys;    /* Base physical address of the region */
        size_t memlength;       /* Region size */
        size_t bar_size;        /* Reported region BAR size */
        __le64 bar_val;         /* Emulated BAR offset registers */
        union {
                void *memaddr;
                void __iomem *ioaddr;
        };                      /* Base virtual address of the region */
};

struct nvgrace_gpu_pci_core_device {
        struct vfio_pci_core_device core_device;
        /* Cached and usable memory for the VM. */
        struct mem_region usemem;
        /* Non cached memory carved out from the end of device memory */
        struct mem_region resmem;
        /* Lock to control device memory kernel mapping */
        struct mutex remap_lock;
        bool has_mig_hw_bug;
};

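/*
 * Reset the emulated BAR base registers of both fake BARs. Done on
 * device open and on VFIO_DEVICE_RESET so the guest always starts
 * from a cleared BAR value.
 */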
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);

        nvdev->resmem.bar_val = 0;
        nvdev->usemem.bar_val = 0;
}

/* Choose the structure corresponding to the fake BAR with a given index. */
static struct mem_region *
nvgrace_gpu_memregion(int index,
                      struct nvgrace_gpu_pci_core_device *nvdev)
{
        if (index == USEMEM_REGION_INDEX)
                return &nvdev->usemem;

        if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
                return &nvdev->resmem;

        return NULL;
}

static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
        struct vfio_pci_core_device *vdev =
                container_of(core_vdev, struct vfio_pci_core_device, vdev);
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);
        int ret;

        ret = vfio_pci_core_enable(vdev);
        if (ret)
                return ret;

        if (nvdev->usemem.memlength) {
                nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
                mutex_init(&nvdev->remap_lock);
        }

        vfio_pci_core_finish_enable(vdev);

        return 0;
}

static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);

        /* Unmap the mapping to the device memory cached region */
        if (nvdev->usemem.memaddr) {
                memunmap(nvdev->usemem.memaddr);
                nvdev->usemem.memaddr = NULL;
        }

        /* Unmap the mapping to the device memory non-cached region */
        if (nvdev->resmem.ioaddr) {
                iounmap(nvdev->resmem.ioaddr);
                nvdev->resmem.ioaddr = NULL;
        }

        mutex_destroy(&nvdev->remap_lock);

        vfio_pci_core_close_device(core_vdev);
}

static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
                            struct vm_area_struct *vma)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);
        struct mem_region *memregion;
        unsigned long start_pfn;
        u64 req_len, pgoff, end;
        unsigned int index;
        int ret = 0;

        index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

        memregion = nvgrace_gpu_memregion(index, nvdev);
        if (!memregion)
                return vfio_pci_core_mmap(core_vdev, vma);

        /*
         * Request to mmap the BAR. Map to the CPU accessible memory on the
         * GPU using the memory information gathered from the system ACPI
         * tables.
         */
        pgoff = vma->vm_pgoff &
                ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

        if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
            check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
            check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
                return -EOVERFLOW;

        /*
         * Check that the mapping request does not go beyond the available
         * device memory size.
         */
        if (end > memregion->memlength)
                return -EINVAL;

        /*
         * The carved out region of the device memory needs the NORMAL_NC
         * property. Communicate as such to the hypervisor.
         */
        if (index == RESMEM_REGION_INDEX) {
                /*
                 * The nvgrace-gpu module has no issues with uncontained
                 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
                 * set to communicate to KVM that the stage-2 mapping may be
                 * NORMAL_NC. This opens up guest usage of NORMAL_NC for this
                 * mapping.
                 */
                vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);

                vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
        }

        /*
         * Perform a PFN map to the memory and back the device BAR by the
         * GPU memory.
         *
         * The available GPU memory size may not be power-of-2 aligned. The
         * remainder is only backed by vfio_device_ops read/write handlers.
         *
         * During device reset, the GPU is safely disconnected from the CPU
         * and accesses to the BAR return immediately, preventing a machine
         * check.
         */
        ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
                              req_len, vma->vm_page_prot);
        if (ret)
                return ret;

        vma->vm_pgoff = start_pfn;

        return 0;
}

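/*
 * Emulate VFIO_DEVICE_GET_REGION_INFO for the fake BARs: report the
 * power-of-2 rounded BAR size and advertise only the actual device
 * memory as a sparse mmap capable area. Other regions are handled by
 * the vfio-pci core.
 */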
static long
nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
                                  unsigned long arg)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);
        unsigned long minsz = offsetofend(struct vfio_region_info, offset);
        struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
        struct vfio_region_info_cap_sparse_mmap *sparse;
        struct vfio_region_info info;
        struct mem_region *memregion;
        u32 size;
        int ret;

        if (copy_from_user(&info, (void __user *)arg, minsz))
                return -EFAULT;

        if (info.argsz < minsz)
                return -EINVAL;

        /*
         * Request to determine the BAR region information. Send the
         * GPU memory information.
         */
        memregion = nvgrace_gpu_memregion(info.index, nvdev);
        if (!memregion)
                return vfio_pci_core_ioctl(core_vdev,
                                           VFIO_DEVICE_GET_REGION_INFO, arg);

        size = struct_size(sparse, areas, 1);

        /*
         * Set up sparse mapping for the device memory. Only the
         * available device memory on the hardware is shown as a
         * mappable region.
         */
        sparse = kzalloc(size, GFP_KERNEL);
        if (!sparse)
                return -ENOMEM;

        sparse->nr_areas = 1;
        sparse->areas[0].offset = 0;
        sparse->areas[0].size = memregion->memlength;
        sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
        sparse->header.version = 1;

        ret = vfio_info_add_capability(&caps, &sparse->header, size);
        kfree(sparse);
        if (ret)
                return ret;

        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
        /*
         * The region memory size may not be power-of-2 aligned. Since the
         * memory is exposed as a BAR, round the reported size up to the
         * next power-of-2.
         */
        info.size = memregion->bar_size;
        info.flags = VFIO_REGION_INFO_FLAG_READ |
                     VFIO_REGION_INFO_FLAG_WRITE |
                     VFIO_REGION_INFO_FLAG_MMAP;

        if (caps.size) {
                info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
                if (info.argsz < sizeof(info) + caps.size) {
                        info.argsz = sizeof(info) + caps.size;
                        info.cap_offset = 0;
                } else {
                        vfio_info_cap_shift(&caps, sizeof(info));
                        if (copy_to_user((void __user *)arg +
                                         sizeof(info), caps.buf,
                                         caps.size)) {
                                kfree(caps.buf);
                                return -EFAULT;
                        }
                        info.cap_offset = sizeof(info);
                }
                kfree(caps.buf);
        }
        return copy_to_user((void __user *)arg, &info, minsz) ?
               -EFAULT : 0;
}

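/*
 * Device specific ioctl handling. VFIO_DEVICE_IOEVENTFD is not supported
 * on this device, and VFIO_DEVICE_RESET additionally clears the emulated
 * fake BAR registers before being forwarded to the vfio-pci core.
 */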
static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
                              unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case VFIO_DEVICE_GET_REGION_INFO:
                return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
        case VFIO_DEVICE_IOEVENTFD:
                return -ENOTTY;
        case VFIO_DEVICE_RESET:
                nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
                fallthrough;
        default:
                return vfio_pci_core_ioctl(core_vdev, cmd, arg);
        }
}

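/*
 * Compute the value returned on a config space read of an emulated BAR
 * register: the programmed base with the bits below the BAR size cleared,
 * merged with the memory type/prefetchable flag bits, mirroring how a real
 * 64b prefetchable BAR reads back.
 */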
static __le64
nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
{
        u64 tmp_val;

        tmp_val = le64_to_cpu(val64);
        tmp_val &= ~(bar_size - 1);
        tmp_val |= flags;

        return cpu_to_le64(tmp_val);
}

/*
 * Both the usable (usemem) and the reserved (resmem) device memory regions
 * are exposed as 64b fake device BARs in the VM. These fake BARs must
 * respond to the accesses on their respective PCI config space offsets.
 *
 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
 */
static ssize_t
nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
                            char __user *buf, size_t count, loff_t *ppos)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);
        u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
        struct mem_region *memregion = NULL;
        __le64 val64;
        size_t register_offset;
        loff_t copy_offset;
        size_t copy_count;
        int ret;

        ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
        if (ret < 0)
                return ret;

        if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
                                                sizeof(val64),
                                                &copy_offset, &copy_count,
                                                &register_offset))
                memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
        else if (vfio_pci_core_range_intersect_range(pos, count,
                                                     PCI_BASE_ADDRESS_4,
                                                     sizeof(val64),
                                                     &copy_offset, &copy_count,
                                                     &register_offset))
                memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

        if (memregion) {
                val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
                                                   PCI_BASE_ADDRESS_MEM_TYPE_64 |
                                                   PCI_BASE_ADDRESS_MEM_PREFETCH,
                                                   memregion->bar_val);
                if (copy_to_user(buf + copy_offset,
                                 (void *)&val64 + register_offset, copy_count)) {
                        /*
                         * The position has been incremented in
                         * vfio_pci_core_read. Reset the offset back to the
                         * starting position.
                         */
                        *ppos -= count;
                        return -EFAULT;
                }
        }

        return count;
}

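/*
 * Capture writes that target the fake BAR base address registers
 * (PCI_BASE_ADDRESS_2..5) into the emulated bar_val of the corresponding
 * region; all other config space writes are passed to the vfio-pci core.
 */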
static ssize_t
nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
                             const char __user *buf, size_t count, loff_t *ppos)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);
        u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
        struct mem_region *memregion = NULL;
        size_t register_offset;
        loff_t copy_offset;
        size_t copy_count;

        if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
                                                sizeof(u64), &copy_offset,
                                                &copy_count, &register_offset))
                memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
        else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
                                                     sizeof(u64), &copy_offset,
                                                     &copy_count, &register_offset))
                memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

        if (memregion) {
                if (copy_from_user((void *)&memregion->bar_val + register_offset,
                                   buf + copy_offset, copy_count))
                        return -EFAULT;
                *ppos += copy_count;
                return copy_count;
        }

        return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

/*
 * Lazily map the device memory into the module kernel VA space. Primarily
 * needed because vfio does not require the userspace driver to perform
 * accesses only through mmaps of the vfio-pci BAR regions; such accesses
 * must also be supported through the vfio_device_ops read/write
 * implementations.
 *
 * The usemem region is cacheable memory and hence is memremapped.
 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
 */
static int
nvgrace_gpu_map_device_mem(int index,
                           struct nvgrace_gpu_pci_core_device *nvdev)
{
        struct mem_region *memregion;
        int ret = 0;

        memregion = nvgrace_gpu_memregion(index, nvdev);
        if (!memregion)
                return -EINVAL;

        mutex_lock(&nvdev->remap_lock);

        if (memregion->memaddr)
                goto unlock;

        if (index == USEMEM_REGION_INDEX)
                memregion->memaddr = memremap(memregion->memphys,
                                              memregion->memlength,
                                              MEMREMAP_WB);
        else
                memregion->ioaddr = ioremap_wc(memregion->memphys,
                                               memregion->memlength);

        if (!memregion->memaddr)
                ret = -ENOMEM;

unlock:
        mutex_unlock(&nvdev->remap_lock);

        return ret;
}

/*
 * Read the data from the device memory (mapped either through ioremap
 * or memremap) into the user buffer.
 */
static int
nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
                         char __user *buf, size_t mem_count, loff_t *ppos)
{
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
        int ret;

        if (!mem_count)
                return 0;

        /*
         * Handle reads on the BAR regions. Map to the target device memory
         * physical address and copy to the requested read buffer.
         */
        ret = nvgrace_gpu_map_device_mem(index, nvdev);
        if (ret)
                return ret;

        if (index == USEMEM_REGION_INDEX) {
                if (copy_to_user(buf,
                                 (u8 *)nvdev->usemem.memaddr + offset,
                                 mem_count))
                        ret = -EFAULT;
        } else {
                /*
                 * The hardware ensures that the system does not crash when
                 * the device memory is accessed with the memory enable
                 * turned off. It synthesizes ~0 on such reads. So there is
                 * no need to check or support the disablement/enablement of
                 * BAR through the PCI_COMMAND config space register. Pass
                 * the test_mem flag as false.
                 */
                ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
                                             nvdev->resmem.ioaddr,
                                             buf, offset, mem_count,
                                             0, 0, false);
        }

        return ret;
}

/*
 * Read count bytes from the device memory at an offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes
 * the size to a power-of-2 (reported) when exposing it to a user space
 * driver.
 *
 * Reads starting beyond the reported size generate -EINVAL; reads extending
 * beyond the actual device size are filled with ~0; reads extending beyond
 * the reported size are truncated.
 */
static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
                     char __user *buf, size_t count, loff_t *ppos)
{
        u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        struct mem_region *memregion;
        size_t mem_count, i;
        u8 val = 0xFF;
        int ret;

        /* No need for a NULL check since the caller already validated the region. */
        memregion = nvgrace_gpu_memregion(index, nvdev);

        if (offset >= memregion->bar_size)
                return -EINVAL;

        /* Clip the read request to the reported BAR size */
        count = min(count, memregion->bar_size - (size_t)offset);

        /*
         * Determine how many bytes are to be actually read from the device
         * memory. Read requests beyond the actual device memory size are
         * filled with ~0, while those beyond the reported size are skipped.
         */
        if (offset >= memregion->memlength)
                mem_count = 0;
        else
                mem_count = min(count, memregion->memlength - (size_t)offset);

        ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
        if (ret)
                return ret;

        /*
         * Only the device memory present on the hardware is mapped, which may
         * not be power-of-2 aligned. A read to an offset beyond the device
         * memory size is filled with ~0.
         */
        for (i = mem_count; i < count; i++) {
                ret = put_user(val, (unsigned char __user *)(buf + i));
                if (ret)
                        return ret;
        }

        *ppos += count;
        return count;
}

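/*
 * Top level read handler: route accesses to the fake BAR regions to the
 * device memory handler, config space accesses to the BAR emulation, and
 * everything else to the vfio-pci core.
 */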
static ssize_t
nvgrace_gpu_read(struct vfio_device *core_vdev,
                 char __user *buf, size_t count, loff_t *ppos)
{
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);

        if (nvgrace_gpu_memregion(index, nvdev))
                return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);

        if (index == VFIO_PCI_CONFIG_REGION_INDEX)
                return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);

        return vfio_pci_core_read(core_vdev, buf, count, ppos);
}

/*
 * Write the data to the device memory (mapped either through ioremap
 * or memremap) from the user buffer.
 */
static int
nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
                          const char __user *buf, size_t mem_count,
                          loff_t *ppos)
{
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
        int ret;

        if (!mem_count)
                return 0;

        ret = nvgrace_gpu_map_device_mem(index, nvdev);
        if (ret)
                return ret;

        if (index == USEMEM_REGION_INDEX) {
                if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
                                   buf, mem_count))
                        return -EFAULT;
        } else {
                /*
                 * The hardware ensures that the system does not crash when
                 * the device memory is accessed with the memory enable
                 * turned off. It drops such writes. So there is no need to
                 * check or support the disablement/enablement of BAR
                 * through the PCI_COMMAND config space register. Pass the
                 * test_mem flag as false.
                 */
                ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
                                             nvdev->resmem.ioaddr,
                                             (char __user *)buf, pos, mem_count,
                                             0, 0, true);
        }

        return ret;
}

/*
 * Write count bytes to the device memory at a given offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes the
 * size to a power-of-2 (reported) when exposing it to a user space driver.
 *
 * Writes extending beyond the reported size are truncated; writes starting
 * beyond the reported size generate -EINVAL.
 */
static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
                      size_t count, loff_t *ppos, const char __user *buf)
{
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
        struct mem_region *memregion;
        size_t mem_count;
        int ret = 0;

        /* No need for a NULL check since the caller already validated the region. */
        memregion = nvgrace_gpu_memregion(index, nvdev);

        if (offset >= memregion->bar_size)
                return -EINVAL;

        /* Clip the write request to the reported BAR size */
        count = min(count, memregion->bar_size - (size_t)offset);

        /*
         * Determine how many bytes are to be actually written to the device
         * memory. Do not write beyond the available size.
         */
        if (offset >= memregion->memlength)
                goto exitfn;

        /*
         * Only the device memory present on the hardware is mapped, which may
         * not be power-of-2 aligned. Drop accesses outside the available
         * device memory on the hardware.
         */
        mem_count = min(count, memregion->memlength - (size_t)offset);

        ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
        if (ret)
                return ret;

exitfn:
        *ppos += count;
        return count;
}

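/* Write counterpart of nvgrace_gpu_read(); same routing of accesses. */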
static ssize_t
nvgrace_gpu_write(struct vfio_device *core_vdev,
                  const char __user *buf, size_t count, loff_t *ppos)
{
        struct nvgrace_gpu_pci_core_device *nvdev =
                container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                             core_device.vdev);
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

        if (nvgrace_gpu_memregion(index, nvdev))
                return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);

        if (index == VFIO_PCI_CONFIG_REGION_INDEX)
                return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);

        return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

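/*
 * Two sets of device ops are registered from probe(): nvgrace_gpu_pci_ops
 * when the device memory properties are available and the fake BARs are
 * emulated, and nvgrace_gpu_pci_core_ops as a plain vfio-pci fallback when
 * they are not.
 */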
static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
        .name = "nvgrace-gpu-vfio-pci",
        .init = vfio_pci_core_init_dev,
        .release = vfio_pci_core_release_dev,
        .open_device = nvgrace_gpu_open_device,
        .close_device = nvgrace_gpu_close_device,
        .ioctl = nvgrace_gpu_ioctl,
        .device_feature = vfio_pci_core_ioctl_feature,
        .read = nvgrace_gpu_read,
        .write = nvgrace_gpu_write,
        .mmap = nvgrace_gpu_mmap,
        .request = vfio_pci_core_request,
        .match = vfio_pci_core_match,
        .match_token_uuid = vfio_pci_core_match_token_uuid,
        .bind_iommufd = vfio_iommufd_physical_bind,
        .unbind_iommufd = vfio_iommufd_physical_unbind,
        .attach_ioas = vfio_iommufd_physical_attach_ioas,
        .detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
        .name = "nvgrace-gpu-vfio-pci-core",
        .init = vfio_pci_core_init_dev,
        .release = vfio_pci_core_release_dev,
        .open_device = nvgrace_gpu_open_device,
        .close_device = vfio_pci_core_close_device,
        .ioctl = vfio_pci_core_ioctl,
        .device_feature = vfio_pci_core_ioctl_feature,
        .read = vfio_pci_core_read,
        .write = vfio_pci_core_write,
        .mmap = vfio_pci_core_mmap,
        .request = vfio_pci_core_request,
        .match = vfio_pci_core_match,
        .match_token_uuid = vfio_pci_core_match_token_uuid,
        .bind_iommufd = vfio_iommufd_physical_bind,
        .unbind_iommufd = vfio_iommufd_physical_unbind,
        .attach_ioas = vfio_iommufd_physical_attach_ioas,
        .detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int
nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
                                  u64 *pmemphys, u64 *pmemlength)
{
        int ret;

        /*
         * The memory information is present in the system ACPI tables as DSD
         * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
         */
        ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
                                       pmemphys);
        if (ret)
                return ret;

        if (*pmemphys > type_max(phys_addr_t))
                return -EOVERFLOW;

        ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
                                       pmemlength);
        if (ret)
                return ret;

        if (*pmemlength > type_max(size_t))
                return -EOVERFLOW;

        /*
         * If the C2C link is not up due to an error, the coherent device
         * memory size is returned as 0. Fail in that case.
         */
        if (*pmemlength == 0)
                return -ENOMEM;

        return ret;
}

static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
                              struct nvgrace_gpu_pci_core_device *nvdev,
                              u64 memphys, u64 memlength)
{
        int ret = 0;
        u64 resmem_size = 0;

        /*
         * On Grace Hopper systems, the VM GPU device driver needs a
         * non-cacheable region to support the MIG feature owing to a hardware
         * bug. Since the device memory is mapped as NORMAL cached, carve out
         * a region from the end with a different NORMAL_NC property (called
         * reserved memory and represented as resmem). This region is then
         * exposed as a 64b BAR (region 2 and 3) to the VM, while exposing the
         * rest (termed usable memory and represented using usemem) as a
         * cacheable 64b BAR (region 4 and 5).
         *
         *               devmem (memlength)
         * |-------------------------------------------------|
         * |                                         |
         * usemem.memphys                      resmem.memphys
         *
         * This hardware bug is fixed on the Grace Blackwell platforms and the
         * presence of the bug can be determined through nvdev->has_mig_hw_bug.
         * Thus on systems with the hardware fix, there is no need to partition
         * the GPU device memory and the entire memory is usable and mapped as
         * NORMAL cached (i.e. the resmem size is 0).
         */
        if (nvdev->has_mig_hw_bug)
                resmem_size = SZ_1G;

        nvdev->usemem.memphys = memphys;

        /*
         * The device memory exposed to the VM is added to the kernel by the
         * VM driver module in chunks of memory block size. Note that only the
         * usable memory (usemem) is added to the kernel for usage by the VM
         * workloads.
         */
        if (check_sub_overflow(memlength, resmem_size,
                               &nvdev->usemem.memlength)) {
                ret = -EOVERFLOW;
                goto done;
        }

        /*
         * The usemem region is exposed as a 64b BAR composed of region 4 and
         * 5. Calculate and save the BAR size for the region.
         */
        nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);

        /*
         * If the hardware has the fix for MIG, there is no requirement
         * for splitting the device memory to create RESMEM. The entire
         * device memory is usable and will be USEMEM. Return here in
         * that case.
         */
        if (!nvdev->has_mig_hw_bug)
                goto done;

        /*
         * When the device memory is split to work around the MIG bug on
         * Grace Hopper, the USEMEM part of the device memory has to be
         * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
         * GPU FW and VFIO driver. The VM device driver is also aware of it
         * and makes use of the value for its calculation to determine USEMEM
         * size. Note that the device memory may not be 512M aligned.
         */
        nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
                                             MEMBLK_SIZE);
        if (nvdev->usemem.memlength == 0) {
                ret = -EINVAL;
                goto done;
        }

        if ((check_add_overflow(nvdev->usemem.memphys,
                                nvdev->usemem.memlength,
                                &nvdev->resmem.memphys)) ||
            (check_sub_overflow(memlength, nvdev->usemem.memlength,
                                &nvdev->resmem.memlength))) {
                ret = -EOVERFLOW;
                goto done;
        }

        /*
         * The resmem region is exposed as a 64b BAR composed of region 2 and
         * 3 for Grace Hopper. Calculate and save the BAR size for the region.
         */
        nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
        return ret;
}

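/*
 * The MIG hardware bug that requires the cacheable/non-cacheable split is
 * reported through an NVIDIA DVSEC capability. If the capability flags MIG
 * as supported with a cached resmem, the bug is absent and no carve-out is
 * needed.
 */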
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
        int pcie_dvsec;
        u16 dvsec_ctrl16;

        pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
                                               GPU_CAP_DVSEC_REGISTER);

        if (pcie_dvsec) {
                pci_read_config_word(pdev,
                                     pcie_dvsec + DVSEC_BITMAP_OFFSET,
                                     &dvsec_ctrl16);

                if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
                        return false;
        }

        return true;
}

/*
 * To reduce the system bootup time, the HBM training has
 * been moved out of the UEFI on the Grace-Blackwell systems.
 *
 * The onus of checking whether the HBM training has completed
 * thus falls on the module. The HBM training status can be
 * determined from a BAR0 register.
 *
 * Similarly, another BAR0 register exposes the status of the
 * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
 *
 * Poll these registers for up to 30s. If the HBM training is
 * not complete or if the C2C link is not ready, fail the probe.
 *
 * While the wait is not required on Grace Hopper systems, it
 * is beneficial to make the check to ensure the device is in an
 * expected state.
 *
 * Ensure that the BAR0 region is enabled before accessing the
 * registers.
 */
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
{
        unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
        void __iomem *io;
        int ret;

        ret = pci_enable_device(pdev);
        if (ret)
                return ret;

        ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
        if (ret)
                goto request_region_exit;

        io = pci_iomap(pdev, 0, 0);
        if (!io) {
                ret = -ENOMEM;
                goto iomap_exit;
        }

        do {
                if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
                    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
                        ret = 0;
                        goto reg_check_exit;
                }
                msleep(POLL_QUANTUM_MS);
        } while (!time_after(jiffies, timeout));

        /* Timed out: neither register reported ready within POLL_TIMEOUT_MS. */
        ret = -ETIME;

reg_check_exit:
        pci_iounmap(pdev, io);
iomap_exit:
        pci_release_selected_regions(pdev, 1 << 0);
request_region_exit:
        pci_disable_device(pdev);
        return ret;
}

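/*
 * Wait for the GPU to report ready, then fetch the device memory properties
 * from the ACPI tables. If the properties are present, register with the
 * extended nvgrace_gpu_pci_ops; otherwise fall back to the plain vfio-pci
 * core ops.
 */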
static int nvgrace_gpu_probe(struct pci_dev *pdev,
                             const struct pci_device_id *id)
{
        const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
        struct nvgrace_gpu_pci_core_device *nvdev;
        u64 memphys, memlength;
        int ret;

        ret = nvgrace_gpu_wait_device_ready(pdev);
        if (ret)
                return ret;

        ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
        if (!ret)
                ops = &nvgrace_gpu_pci_ops;

        nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
                                  &pdev->dev, ops);
        if (IS_ERR(nvdev))
                return PTR_ERR(nvdev);

        dev_set_drvdata(&pdev->dev, &nvdev->core_device);

        if (ops == &nvgrace_gpu_pci_ops) {
                nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

                /*
                 * Device memory properties are identified in the host ACPI
                 * table. Set up the nvgrace_gpu_pci_core_device structure.
                 */
                ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
                                                    memphys, memlength);
                if (ret)
                        goto out_put_vdev;
        }

        ret = vfio_pci_core_register_device(&nvdev->core_device);
        if (ret)
                goto out_put_vdev;

        return ret;

out_put_vdev:
        vfio_put_device(&nvdev->core_device.vdev);
        return ret;
}

static void nvgrace_gpu_remove(struct pci_dev *pdev)
{
        struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

        vfio_pci_core_unregister_device(core_device);
        vfio_put_device(&core_device->vdev);
}

static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
        /* GH200 120GB */
        { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
        /* GH200 480GB */
        { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
        /* GH200 SKU */
        { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
        /* GB200 SKU */
        { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
        {}
};

MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);

static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
        .name = KBUILD_MODNAME,
        .id_table = nvgrace_gpu_vfio_pci_table,
        .probe = nvgrace_gpu_probe,
        .remove = nvgrace_gpu_remove,
        .err_handler = &vfio_pci_core_err_handlers,
        .driver_managed_dma = true,
};

module_pci_driver(nvgrace_gpu_vfio_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");