1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 */ 5 6 #include <linux/sizes.h> 7 #include <linux/vfio_pci_core.h> 8 9 /* 10 * The device memory usable to the workloads running in the VM is cached 11 * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) 12 * to the VM and is represented as usemem. 13 * Moreover, the VM GPU device driver needs a non-cacheable region to 14 * support the MIG feature. This region is also exposed as a 64b BAR 15 * (comprising of BAR2 and BAR3 region) and represented as resmem. 16 */ 17 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX 18 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX 19 20 /* Memory size expected as non cached and reserved by the VM driver */ 21 #define RESMEM_SIZE SZ_1G 22 23 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ 24 #define MEMBLK_SIZE SZ_512M 25 26 /* 27 * The state of the two device memory region - resmem and usemem - is 28 * saved as struct mem_region. 29 */ 30 struct mem_region { 31 phys_addr_t memphys; /* Base physical address of the region */ 32 size_t memlength; /* Region size */ 33 size_t bar_size; /* Reported region BAR size */ 34 __le64 bar_val; /* Emulated BAR offset registers */ 35 union { 36 void *memaddr; 37 void __iomem *ioaddr; 38 }; /* Base virtual address of the region */ 39 }; 40 41 struct nvgrace_gpu_pci_core_device { 42 struct vfio_pci_core_device core_device; 43 /* Cached and usable memory for the VM. */ 44 struct mem_region usemem; 45 /* Non cached memory carved out from the end of device memory */ 46 struct mem_region resmem; 47 /* Lock to control device memory kernel mapping */ 48 struct mutex remap_lock; 49 }; 50 51 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) 52 { 53 struct nvgrace_gpu_pci_core_device *nvdev = 54 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 55 core_device.vdev); 56 57 nvdev->resmem.bar_val = 0; 58 nvdev->usemem.bar_val = 0; 59 } 60 61 /* Choose the structure corresponding to the fake BAR with a given index. */ 62 static struct mem_region * 63 nvgrace_gpu_memregion(int index, 64 struct nvgrace_gpu_pci_core_device *nvdev) 65 { 66 if (index == USEMEM_REGION_INDEX) 67 return &nvdev->usemem; 68 69 if (index == RESMEM_REGION_INDEX) 70 return &nvdev->resmem; 71 72 return NULL; 73 } 74 75 static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) 76 { 77 struct vfio_pci_core_device *vdev = 78 container_of(core_vdev, struct vfio_pci_core_device, vdev); 79 struct nvgrace_gpu_pci_core_device *nvdev = 80 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 81 core_device.vdev); 82 int ret; 83 84 ret = vfio_pci_core_enable(vdev); 85 if (ret) 86 return ret; 87 88 if (nvdev->usemem.memlength) { 89 nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 90 mutex_init(&nvdev->remap_lock); 91 } 92 93 vfio_pci_core_finish_enable(vdev); 94 95 return 0; 96 } 97 98 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) 99 { 100 struct nvgrace_gpu_pci_core_device *nvdev = 101 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 102 core_device.vdev); 103 104 /* Unmap the mapping to the device memory cached region */ 105 if (nvdev->usemem.memaddr) { 106 memunmap(nvdev->usemem.memaddr); 107 nvdev->usemem.memaddr = NULL; 108 } 109 110 /* Unmap the mapping to the device memory non-cached region */ 111 if (nvdev->resmem.ioaddr) { 112 iounmap(nvdev->resmem.ioaddr); 113 nvdev->resmem.ioaddr = NULL; 114 } 115 116 mutex_destroy(&nvdev->remap_lock); 117 118 vfio_pci_core_close_device(core_vdev); 119 } 120 121 static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, 122 struct vm_area_struct *vma) 123 { 124 struct nvgrace_gpu_pci_core_device *nvdev = 125 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 126 core_device.vdev); 127 struct mem_region *memregion; 128 unsigned long start_pfn; 129 u64 req_len, pgoff, end; 130 unsigned int index; 131 int ret = 0; 132 133 index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 134 135 memregion = nvgrace_gpu_memregion(index, nvdev); 136 if (!memregion) 137 return vfio_pci_core_mmap(core_vdev, vma); 138 139 /* 140 * Request to mmap the BAR. Map to the CPU accessible memory on the 141 * GPU using the memory information gathered from the system ACPI 142 * tables. 143 */ 144 pgoff = vma->vm_pgoff & 145 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 146 147 if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || 148 check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || 149 check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) 150 return -EOVERFLOW; 151 152 /* 153 * Check that the mapping request does not go beyond available device 154 * memory size 155 */ 156 if (end > memregion->memlength) 157 return -EINVAL; 158 159 /* 160 * The carved out region of the device memory needs the NORMAL_NC 161 * property. Communicate as such to the hypervisor. 162 */ 163 if (index == RESMEM_REGION_INDEX) { 164 /* 165 * The nvgrace-gpu module has no issues with uncontained 166 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is 167 * set to communicate to the KVM to S2 map as NORMAL_NC. 168 * This opens up guest usage of NORMAL_NC for this mapping. 169 */ 170 vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); 171 172 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 173 } 174 175 /* 176 * Perform a PFN map to the memory and back the device BAR by the 177 * GPU memory. 178 * 179 * The available GPU memory size may not be power-of-2 aligned. The 180 * remainder is only backed by vfio_device_ops read/write handlers. 181 * 182 * During device reset, the GPU is safely disconnected to the CPU 183 * and access to the BAR will be immediately returned preventing 184 * machine check. 185 */ 186 ret = remap_pfn_range(vma, vma->vm_start, start_pfn, 187 req_len, vma->vm_page_prot); 188 if (ret) 189 return ret; 190 191 vma->vm_pgoff = start_pfn; 192 193 return 0; 194 } 195 196 static long 197 nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, 198 unsigned long arg) 199 { 200 struct nvgrace_gpu_pci_core_device *nvdev = 201 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 202 core_device.vdev); 203 unsigned long minsz = offsetofend(struct vfio_region_info, offset); 204 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 205 struct vfio_region_info_cap_sparse_mmap *sparse; 206 struct vfio_region_info info; 207 struct mem_region *memregion; 208 u32 size; 209 int ret; 210 211 if (copy_from_user(&info, (void __user *)arg, minsz)) 212 return -EFAULT; 213 214 if (info.argsz < minsz) 215 return -EINVAL; 216 217 /* 218 * Request to determine the BAR region information. Send the 219 * GPU memory information. 220 */ 221 memregion = nvgrace_gpu_memregion(info.index, nvdev); 222 if (!memregion) 223 return vfio_pci_core_ioctl(core_vdev, 224 VFIO_DEVICE_GET_REGION_INFO, arg); 225 226 size = struct_size(sparse, areas, 1); 227 228 /* 229 * Setup for sparse mapping for the device memory. Only the 230 * available device memory on the hardware is shown as a 231 * mappable region. 232 */ 233 sparse = kzalloc(size, GFP_KERNEL); 234 if (!sparse) 235 return -ENOMEM; 236 237 sparse->nr_areas = 1; 238 sparse->areas[0].offset = 0; 239 sparse->areas[0].size = memregion->memlength; 240 sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; 241 sparse->header.version = 1; 242 243 ret = vfio_info_add_capability(&caps, &sparse->header, size); 244 kfree(sparse); 245 if (ret) 246 return ret; 247 248 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 249 /* 250 * The region memory size may not be power-of-2 aligned. 251 * Given that the memory as a BAR and may not be 252 * aligned, roundup to the next power-of-2. 253 */ 254 info.size = memregion->bar_size; 255 info.flags = VFIO_REGION_INFO_FLAG_READ | 256 VFIO_REGION_INFO_FLAG_WRITE | 257 VFIO_REGION_INFO_FLAG_MMAP; 258 259 if (caps.size) { 260 info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 261 if (info.argsz < sizeof(info) + caps.size) { 262 info.argsz = sizeof(info) + caps.size; 263 info.cap_offset = 0; 264 } else { 265 vfio_info_cap_shift(&caps, sizeof(info)); 266 if (copy_to_user((void __user *)arg + 267 sizeof(info), caps.buf, 268 caps.size)) { 269 kfree(caps.buf); 270 return -EFAULT; 271 } 272 info.cap_offset = sizeof(info); 273 } 274 kfree(caps.buf); 275 } 276 return copy_to_user((void __user *)arg, &info, minsz) ? 277 -EFAULT : 0; 278 } 279 280 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, 281 unsigned int cmd, unsigned long arg) 282 { 283 switch (cmd) { 284 case VFIO_DEVICE_GET_REGION_INFO: 285 return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); 286 case VFIO_DEVICE_IOEVENTFD: 287 return -ENOTTY; 288 case VFIO_DEVICE_RESET: 289 nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 290 fallthrough; 291 default: 292 return vfio_pci_core_ioctl(core_vdev, cmd, arg); 293 } 294 } 295 296 static __le64 297 nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) 298 { 299 u64 tmp_val; 300 301 tmp_val = le64_to_cpu(val64); 302 tmp_val &= ~(bar_size - 1); 303 tmp_val |= flags; 304 305 return cpu_to_le64(tmp_val); 306 } 307 308 /* 309 * Both the usable (usemem) and the reserved (resmem) device memory region 310 * are exposed as a 64b fake device BARs in the VM. These fake BARs must 311 * respond to the accesses on their respective PCI config space offsets. 312 * 313 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. 314 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. 315 */ 316 static ssize_t 317 nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, 318 char __user *buf, size_t count, loff_t *ppos) 319 { 320 struct nvgrace_gpu_pci_core_device *nvdev = 321 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 322 core_device.vdev); 323 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 324 struct mem_region *memregion = NULL; 325 __le64 val64; 326 size_t register_offset; 327 loff_t copy_offset; 328 size_t copy_count; 329 int ret; 330 331 ret = vfio_pci_core_read(core_vdev, buf, count, ppos); 332 if (ret < 0) 333 return ret; 334 335 if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 336 sizeof(val64), 337 ©_offset, ©_count, 338 ®ister_offset)) 339 memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 340 else if (vfio_pci_core_range_intersect_range(pos, count, 341 PCI_BASE_ADDRESS_4, 342 sizeof(val64), 343 ©_offset, ©_count, 344 ®ister_offset)) 345 memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 346 347 if (memregion) { 348 val64 = nvgrace_gpu_get_read_value(memregion->bar_size, 349 PCI_BASE_ADDRESS_MEM_TYPE_64 | 350 PCI_BASE_ADDRESS_MEM_PREFETCH, 351 memregion->bar_val); 352 if (copy_to_user(buf + copy_offset, 353 (void *)&val64 + register_offset, copy_count)) { 354 /* 355 * The position has been incremented in 356 * vfio_pci_core_read. Reset the offset back to the 357 * starting position. 358 */ 359 *ppos -= count; 360 return -EFAULT; 361 } 362 } 363 364 return count; 365 } 366 367 static ssize_t 368 nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, 369 const char __user *buf, size_t count, loff_t *ppos) 370 { 371 struct nvgrace_gpu_pci_core_device *nvdev = 372 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 373 core_device.vdev); 374 u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 375 struct mem_region *memregion = NULL; 376 size_t register_offset; 377 loff_t copy_offset; 378 size_t copy_count; 379 380 if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 381 sizeof(u64), ©_offset, 382 ©_count, ®ister_offset)) 383 memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 384 else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, 385 sizeof(u64), ©_offset, 386 ©_count, ®ister_offset)) 387 memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 388 389 if (memregion) { 390 if (copy_from_user((void *)&memregion->bar_val + register_offset, 391 buf + copy_offset, copy_count)) 392 return -EFAULT; 393 *ppos += copy_count; 394 return copy_count; 395 } 396 397 return vfio_pci_core_write(core_vdev, buf, count, ppos); 398 } 399 400 /* 401 * Ad hoc map the device memory in the module kernel VA space. Primarily needed 402 * as vfio does not require the userspace driver to only perform accesses through 403 * mmaps of the vfio-pci BAR regions and such accesses should be supported using 404 * vfio_device_ops read/write implementations. 405 * 406 * The usemem region is cacheable memory and hence is memremaped. 407 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). 408 */ 409 static int 410 nvgrace_gpu_map_device_mem(int index, 411 struct nvgrace_gpu_pci_core_device *nvdev) 412 { 413 struct mem_region *memregion; 414 int ret = 0; 415 416 memregion = nvgrace_gpu_memregion(index, nvdev); 417 if (!memregion) 418 return -EINVAL; 419 420 mutex_lock(&nvdev->remap_lock); 421 422 if (memregion->memaddr) 423 goto unlock; 424 425 if (index == USEMEM_REGION_INDEX) 426 memregion->memaddr = memremap(memregion->memphys, 427 memregion->memlength, 428 MEMREMAP_WB); 429 else 430 memregion->ioaddr = ioremap_wc(memregion->memphys, 431 memregion->memlength); 432 433 if (!memregion->memaddr) 434 ret = -ENOMEM; 435 436 unlock: 437 mutex_unlock(&nvdev->remap_lock); 438 439 return ret; 440 } 441 442 /* 443 * Read the data from the device memory (mapped either through ioremap 444 * or memremap) into the user buffer. 445 */ 446 static int 447 nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, 448 char __user *buf, size_t mem_count, loff_t *ppos) 449 { 450 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 451 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 452 int ret; 453 454 if (!mem_count) 455 return 0; 456 457 /* 458 * Handle read on the BAR regions. Map to the target device memory 459 * physical address and copy to the request read buffer. 460 */ 461 ret = nvgrace_gpu_map_device_mem(index, nvdev); 462 if (ret) 463 return ret; 464 465 if (index == USEMEM_REGION_INDEX) { 466 if (copy_to_user(buf, 467 (u8 *)nvdev->usemem.memaddr + offset, 468 mem_count)) 469 ret = -EFAULT; 470 } else { 471 /* 472 * The hardware ensures that the system does not crash when 473 * the device memory is accessed with the memory enable 474 * turned off. It synthesizes ~0 on such read. So there is 475 * no need to check or support the disablement/enablement of 476 * BAR through PCI_COMMAND config space register. Pass 477 * test_mem flag as false. 478 */ 479 ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 480 nvdev->resmem.ioaddr, 481 buf, offset, mem_count, 482 0, 0, false); 483 } 484 485 return ret; 486 } 487 488 /* 489 * Read count bytes from the device memory at an offset. The actual device 490 * memory size (available) may not be a power-of-2. So the driver fakes 491 * the size to a power-of-2 (reported) when exposing to a user space driver. 492 * 493 * Reads starting beyond the reported size generate -EINVAL; reads extending 494 * beyond the actual device size is filled with ~0; reads extending beyond 495 * the reported size are truncated. 496 */ 497 static ssize_t 498 nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, 499 char __user *buf, size_t count, loff_t *ppos) 500 { 501 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 502 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 503 struct mem_region *memregion; 504 size_t mem_count, i; 505 u8 val = 0xFF; 506 int ret; 507 508 /* No need to do NULL check as caller does. */ 509 memregion = nvgrace_gpu_memregion(index, nvdev); 510 511 if (offset >= memregion->bar_size) 512 return -EINVAL; 513 514 /* Clip short the read request beyond reported BAR size */ 515 count = min(count, memregion->bar_size - (size_t)offset); 516 517 /* 518 * Determine how many bytes to be actually read from the device memory. 519 * Read request beyond the actual device memory size is filled with ~0, 520 * while those beyond the actual reported size is skipped. 521 */ 522 if (offset >= memregion->memlength) 523 mem_count = 0; 524 else 525 mem_count = min(count, memregion->memlength - (size_t)offset); 526 527 ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); 528 if (ret) 529 return ret; 530 531 /* 532 * Only the device memory present on the hardware is mapped, which may 533 * not be power-of-2 aligned. A read to an offset beyond the device memory 534 * size is filled with ~0. 535 */ 536 for (i = mem_count; i < count; i++) { 537 ret = put_user(val, (unsigned char __user *)(buf + i)); 538 if (ret) 539 return ret; 540 } 541 542 *ppos += count; 543 return count; 544 } 545 546 static ssize_t 547 nvgrace_gpu_read(struct vfio_device *core_vdev, 548 char __user *buf, size_t count, loff_t *ppos) 549 { 550 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 551 struct nvgrace_gpu_pci_core_device *nvdev = 552 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 553 core_device.vdev); 554 555 if (nvgrace_gpu_memregion(index, nvdev)) 556 return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); 557 558 if (index == VFIO_PCI_CONFIG_REGION_INDEX) 559 return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); 560 561 return vfio_pci_core_read(core_vdev, buf, count, ppos); 562 } 563 564 /* 565 * Write the data to the device memory (mapped either through ioremap 566 * or memremap) from the user buffer. 567 */ 568 static int 569 nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, 570 const char __user *buf, size_t mem_count, 571 loff_t *ppos) 572 { 573 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 574 loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 575 int ret; 576 577 if (!mem_count) 578 return 0; 579 580 ret = nvgrace_gpu_map_device_mem(index, nvdev); 581 if (ret) 582 return ret; 583 584 if (index == USEMEM_REGION_INDEX) { 585 if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, 586 buf, mem_count)) 587 return -EFAULT; 588 } else { 589 /* 590 * The hardware ensures that the system does not crash when 591 * the device memory is accessed with the memory enable 592 * turned off. It drops such writes. So there is no need to 593 * check or support the disablement/enablement of BAR 594 * through PCI_COMMAND config space register. Pass test_mem 595 * flag as false. 596 */ 597 ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 598 nvdev->resmem.ioaddr, 599 (char __user *)buf, pos, mem_count, 600 0, 0, true); 601 } 602 603 return ret; 604 } 605 606 /* 607 * Write count bytes to the device memory at a given offset. The actual device 608 * memory size (available) may not be a power-of-2. So the driver fakes the 609 * size to a power-of-2 (reported) when exposing to a user space driver. 610 * 611 * Writes extending beyond the reported size are truncated; writes starting 612 * beyond the reported size generate -EINVAL. 613 */ 614 static ssize_t 615 nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, 616 size_t count, loff_t *ppos, const char __user *buf) 617 { 618 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 619 u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 620 struct mem_region *memregion; 621 size_t mem_count; 622 int ret = 0; 623 624 /* No need to do NULL check as caller does. */ 625 memregion = nvgrace_gpu_memregion(index, nvdev); 626 627 if (offset >= memregion->bar_size) 628 return -EINVAL; 629 630 /* Clip short the write request beyond reported BAR size */ 631 count = min(count, memregion->bar_size - (size_t)offset); 632 633 /* 634 * Determine how many bytes to be actually written to the device memory. 635 * Do not write to the offset beyond available size. 636 */ 637 if (offset >= memregion->memlength) 638 goto exitfn; 639 640 /* 641 * Only the device memory present on the hardware is mapped, which may 642 * not be power-of-2 aligned. Drop access outside the available device 643 * memory on the hardware. 644 */ 645 mem_count = min(count, memregion->memlength - (size_t)offset); 646 647 ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); 648 if (ret) 649 return ret; 650 651 exitfn: 652 *ppos += count; 653 return count; 654 } 655 656 static ssize_t 657 nvgrace_gpu_write(struct vfio_device *core_vdev, 658 const char __user *buf, size_t count, loff_t *ppos) 659 { 660 struct nvgrace_gpu_pci_core_device *nvdev = 661 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 662 core_device.vdev); 663 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 664 665 if (nvgrace_gpu_memregion(index, nvdev)) 666 return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); 667 668 if (index == VFIO_PCI_CONFIG_REGION_INDEX) 669 return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); 670 671 return vfio_pci_core_write(core_vdev, buf, count, ppos); 672 } 673 674 static const struct vfio_device_ops nvgrace_gpu_pci_ops = { 675 .name = "nvgrace-gpu-vfio-pci", 676 .init = vfio_pci_core_init_dev, 677 .release = vfio_pci_core_release_dev, 678 .open_device = nvgrace_gpu_open_device, 679 .close_device = nvgrace_gpu_close_device, 680 .ioctl = nvgrace_gpu_ioctl, 681 .device_feature = vfio_pci_core_ioctl_feature, 682 .read = nvgrace_gpu_read, 683 .write = nvgrace_gpu_write, 684 .mmap = nvgrace_gpu_mmap, 685 .request = vfio_pci_core_request, 686 .match = vfio_pci_core_match, 687 .bind_iommufd = vfio_iommufd_physical_bind, 688 .unbind_iommufd = vfio_iommufd_physical_unbind, 689 .attach_ioas = vfio_iommufd_physical_attach_ioas, 690 .detach_ioas = vfio_iommufd_physical_detach_ioas, 691 }; 692 693 static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { 694 .name = "nvgrace-gpu-vfio-pci-core", 695 .init = vfio_pci_core_init_dev, 696 .release = vfio_pci_core_release_dev, 697 .open_device = nvgrace_gpu_open_device, 698 .close_device = vfio_pci_core_close_device, 699 .ioctl = vfio_pci_core_ioctl, 700 .device_feature = vfio_pci_core_ioctl_feature, 701 .read = vfio_pci_core_read, 702 .write = vfio_pci_core_write, 703 .mmap = vfio_pci_core_mmap, 704 .request = vfio_pci_core_request, 705 .match = vfio_pci_core_match, 706 .bind_iommufd = vfio_iommufd_physical_bind, 707 .unbind_iommufd = vfio_iommufd_physical_unbind, 708 .attach_ioas = vfio_iommufd_physical_attach_ioas, 709 .detach_ioas = vfio_iommufd_physical_detach_ioas, 710 }; 711 712 static int 713 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, 714 u64 *pmemphys, u64 *pmemlength) 715 { 716 int ret; 717 718 /* 719 * The memory information is present in the system ACPI tables as DSD 720 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. 721 */ 722 ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", 723 pmemphys); 724 if (ret) 725 return ret; 726 727 if (*pmemphys > type_max(phys_addr_t)) 728 return -EOVERFLOW; 729 730 ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", 731 pmemlength); 732 if (ret) 733 return ret; 734 735 if (*pmemlength > type_max(size_t)) 736 return -EOVERFLOW; 737 738 /* 739 * If the C2C link is not up due to an error, the coherent device 740 * memory size is returned as 0. Fail in such case. 741 */ 742 if (*pmemlength == 0) 743 return -ENOMEM; 744 745 return ret; 746 } 747 748 static int 749 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, 750 struct nvgrace_gpu_pci_core_device *nvdev, 751 u64 memphys, u64 memlength) 752 { 753 int ret = 0; 754 755 /* 756 * The VM GPU device driver needs a non-cacheable region to support 757 * the MIG feature. Since the device memory is mapped as NORMAL cached, 758 * carve out a region from the end with a different NORMAL_NC 759 * property (called as reserved memory and represented as resmem). This 760 * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while 761 * exposing the rest (termed as usable memory and represented using usemem) 762 * as cacheable 64b BAR (region 4 and 5). 763 * 764 * devmem (memlength) 765 * |-------------------------------------------------| 766 * | | 767 * usemem.memphys resmem.memphys 768 */ 769 nvdev->usemem.memphys = memphys; 770 771 /* 772 * The device memory exposed to the VM is added to the kernel by the 773 * VM driver module in chunks of memory block size. Only the usable 774 * memory (usemem) is added to the kernel for usage by the VM 775 * workloads. Make the usable memory size memblock aligned. 776 */ 777 if (check_sub_overflow(memlength, RESMEM_SIZE, 778 &nvdev->usemem.memlength)) { 779 ret = -EOVERFLOW; 780 goto done; 781 } 782 783 /* 784 * The USEMEM part of the device memory has to be MEMBLK_SIZE 785 * aligned. This is a hardwired ABI value between the GPU FW and 786 * VFIO driver. The VM device driver is also aware of it and make 787 * use of the value for its calculation to determine USEMEM size. 788 */ 789 nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, 790 MEMBLK_SIZE); 791 if (nvdev->usemem.memlength == 0) { 792 ret = -EINVAL; 793 goto done; 794 } 795 796 if ((check_add_overflow(nvdev->usemem.memphys, 797 nvdev->usemem.memlength, 798 &nvdev->resmem.memphys)) || 799 (check_sub_overflow(memlength, nvdev->usemem.memlength, 800 &nvdev->resmem.memlength))) { 801 ret = -EOVERFLOW; 802 goto done; 803 } 804 805 /* 806 * The memory regions are exposed as BARs. Calculate and save 807 * the BAR size for them. 808 */ 809 nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 810 nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); 811 done: 812 return ret; 813 } 814 815 static int nvgrace_gpu_probe(struct pci_dev *pdev, 816 const struct pci_device_id *id) 817 { 818 const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; 819 struct nvgrace_gpu_pci_core_device *nvdev; 820 u64 memphys, memlength; 821 int ret; 822 823 ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); 824 if (!ret) 825 ops = &nvgrace_gpu_pci_ops; 826 827 nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, 828 &pdev->dev, ops); 829 if (IS_ERR(nvdev)) 830 return PTR_ERR(nvdev); 831 832 dev_set_drvdata(&pdev->dev, &nvdev->core_device); 833 834 if (ops == &nvgrace_gpu_pci_ops) { 835 /* 836 * Device memory properties are identified in the host ACPI 837 * table. Set the nvgrace_gpu_pci_core_device structure. 838 */ 839 ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, 840 memphys, memlength); 841 if (ret) 842 goto out_put_vdev; 843 } 844 845 ret = vfio_pci_core_register_device(&nvdev->core_device); 846 if (ret) 847 goto out_put_vdev; 848 849 return ret; 850 851 out_put_vdev: 852 vfio_put_device(&nvdev->core_device.vdev); 853 return ret; 854 } 855 856 static void nvgrace_gpu_remove(struct pci_dev *pdev) 857 { 858 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 859 860 vfio_pci_core_unregister_device(core_device); 861 vfio_put_device(&core_device->vdev); 862 } 863 864 static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { 865 /* GH200 120GB */ 866 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, 867 /* GH200 480GB */ 868 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, 869 {} 870 }; 871 872 MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); 873 874 static struct pci_driver nvgrace_gpu_vfio_pci_driver = { 875 .name = KBUILD_MODNAME, 876 .id_table = nvgrace_gpu_vfio_pci_table, 877 .probe = nvgrace_gpu_probe, 878 .remove = nvgrace_gpu_remove, 879 .err_handler = &vfio_pci_core_err_handlers, 880 .driver_managed_dma = true, 881 }; 882 883 module_pci_driver(nvgrace_gpu_vfio_pci_driver); 884 885 MODULE_LICENSE("GPL"); 886 MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>"); 887 MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>"); 888 MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory"); 889