xref: /linux/drivers/vfio/pci/nvgrace-gpu/main.c (revision ba23adb6533149df33b9a247f31a87227b3c86d5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/sizes.h>
7 #include <linux/vfio_pci_core.h>
8 #include <linux/delay.h>
9 #include <linux/jiffies.h>
10 #include <linux/pci-p2pdma.h>
11 #include <linux/pm_runtime.h>
12 #include <linux/memory-failure.h>
13 
14 /*
15  * The device memory usable to the workloads running in the VM is cached
16  * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region)
17  * to the VM and is represented as usemem.
18  * Moreover, the VM GPU device driver needs a non-cacheable region to
19  * support the MIG feature. This region is also exposed as a 64b BAR
20  * (comprising of BAR2 and BAR3 region) and represented as resmem.
21  */
22 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
23 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
24 
/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M

/* Byte offset of the capability bitmap inside the GPU DVSEC capability. */
#define DVSEC_BITMAP_OFFSET 0xA
/* Bitmap bit: MIG supported while resmem is mapped cached — TODO confirm use in probe path (not visible in this chunk). */
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)

/* DVSEC id of the GPU capability structure — presumably matched during probe. */
#define GPU_CAP_DVSEC_REGISTER 3

/* BAR0 register offsets polled to determine GPU readiness after reset. */
#define C2C_LINK_BAR0_OFFSET 0x1498
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
/* Value both readiness registers report once the GPU is usable. */
#define STATUS_READY 0xFF

/* Readiness poll interval and total timeout, in milliseconds. */
#define POLL_QUANTUM_MS 1000
#define POLL_TIMEOUT_MS (30 * 1000)
39 
40 /*
41  * The state of the two device memory region - resmem and usemem - is
42  * saved as struct mem_region.
43  */
struct mem_region {
	phys_addr_t memphys;    /* Base physical address of the region */
	size_t memlength;       /* Region size (actual device memory) */
	size_t bar_size;        /* Reported region BAR size (power-of-2 rounded) */
	__le64 bar_val;         /* Emulated BAR offset registers */
	union {
		void *memaddr;          /* Kernel VA when memremap'ed (cached usemem) */
		void __iomem *ioaddr;   /* Kernel VA when ioremap'ed (non-cached resmem) */
	};                      /* Base virtual address of the region */
	/* Registered PFN range; its callback maps PFNs back to user VMAs. */
	struct pfn_address_space pfn_address_space;
};
55 
struct nvgrace_gpu_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* Cached and usable memory for the VM. */
	struct mem_region usemem;
	/* Non cached memory carved out from the end of device memory */
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
	/* Device is affected by the MIG HW bug — consumer not visible in this chunk */
	bool has_mig_hw_bug;
	/* GPU has just been reset; cleared once readiness is re-verified */
	bool reset_done;
};
68 
/* Reset the emulated fake BAR base registers to their power-on value (0). */
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	nvdev->resmem.bar_val = 0;
	nvdev->usemem.bar_val = 0;
}
78 
79 /* Choose the structure corresponding to the fake BAR with a given index. */
80 static struct mem_region *
nvgrace_gpu_memregion(int index,struct nvgrace_gpu_pci_core_device * nvdev)81 nvgrace_gpu_memregion(int index,
82 		      struct nvgrace_gpu_pci_core_device *nvdev)
83 {
84 	if (index == USEMEM_REGION_INDEX)
85 		return &nvdev->usemem;
86 
87 	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
88 		return &nvdev->resmem;
89 
90 	return NULL;
91 }
92 
/*
 * Locate @pfn inside the fake BAR region @index and return its page
 * offset from the region base via @pfn_offset_in_region.
 *
 * Returns -EINVAL for an unknown region, -EFAULT when the PFN falls
 * outside the region's device memory.
 */
static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev,
				unsigned int index,
				unsigned long pfn,
				pgoff_t *pfn_offset_in_region)
{
	struct mem_region *mr = nvgrace_gpu_memregion(index, nvdev);
	unsigned long first, last;

	if (!mr)
		return -EINVAL;

	first = PHYS_PFN(mr->memphys);
	last = first + (mr->memlength >> PAGE_SHIFT);

	if (pfn < first || pfn >= last)
		return -EFAULT;

	*pfn_offset_in_region = pfn - first;
	return 0;
}
115 
/* Forward declaration; the definition follows the mmap ops it checks against. */
static inline
struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma);
118 
/*
 * pfn_address_space callback: translate a device memory @pfn into the
 * file page offset (@pgoff) under which @vma maps it.
 *
 * Returns 0 on success, -ENOENT if @vma was not installed by this
 * driver, -EINVAL/-EFAULT if the PFN is outside the region or precedes
 * the VMA's window into it.
 */
static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma,
					unsigned long pfn,
					pgoff_t *pgoff)
{
	struct nvgrace_gpu_pci_core_device *nvdev;
	/* High vm_pgoff bits encode the VFIO region index... */
	unsigned int index =
		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	/* ...low bits are the VMA's starting page offset within the region. */
	pgoff_t vma_offset_in_region = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	pgoff_t pfn_offset_in_region;
	int ret;

	nvdev = vma_to_nvdev(vma);
	if (!nvdev)
		return -ENOENT;

	ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region);
	if (ret)
		return ret;

	/* Ensure PFN is not before VMA's start within the region */
	if (pfn_offset_in_region < vma_offset_in_region)
		return -EFAULT;

	/* File pgoff of the PFN: VMA base pgoff plus pages past its start. */
	*pgoff = vma->vm_pgoff +
		 (pfn_offset_in_region - vma_offset_in_region);

	return 0;
}
149 
150 static int
nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device * core_vdev,struct mem_region * region)151 nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev,
152 					struct mem_region *region)
153 {
154 	unsigned long pfn, nr_pages;
155 
156 	pfn = PHYS_PFN(region->memphys);
157 	nr_pages = region->memlength >> PAGE_SHIFT;
158 
159 	region->pfn_address_space.node.start = pfn;
160 	region->pfn_address_space.node.last = pfn + nr_pages - 1;
161 	region->pfn_address_space.mapping = core_vdev->inode->i_mapping;
162 	region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff;
163 
164 	return register_pfn_address_space(&region->pfn_address_space);
165 }
166 
/*
 * Open the device: enable the core PCI device, initialize the fake BAR
 * emulation state and register the device memory PFN ranges so PFNs in
 * them can be traced back to user VMAs (pfn_to_vma_pgoff callback).
 */
static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (nvdev->usemem.memlength) {
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		mutex_init(&nvdev->remap_lock);
	}

	/*
	 * GPU readiness is checked by reading the BAR0 registers.
	 *
	 * ioremap BAR0 to ensure that the BAR0 mapping is present before
	 * register reads on first fault before establishing any GPU
	 * memory mapping.
	 */
	ret = vfio_pci_core_setup_barmap(vdev, 0);
	if (ret)
		goto error_exit;

	/* resmem is optional; register its PFN range only when carved out. */
	if (nvdev->resmem.memlength) {
		ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem);
		if (ret && ret != -EOPNOTSUPP)
			goto error_exit;
	}

	ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem);
	if (ret && ret != -EOPNOTSUPP)
		goto register_mem_failed;

	vfio_pci_core_finish_enable(vdev);
	return 0;

	/* Unwind in reverse order of the registrations above. */
register_mem_failed:
	if (nvdev->resmem.memlength)
		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
error_exit:
	vfio_pci_core_disable(vdev);
	return ret;
}
216 
/*
 * Release the device: drop the PFN range registrations, tear down any
 * ad hoc kernel mappings of the device memory, then disable the core
 * device.
 */
static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	if (nvdev->resmem.memlength)
		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);

	unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);

	/* Unmap the mapping to the device memory cached region */
	if (nvdev->usemem.memaddr) {
		memunmap(nvdev->usemem.memaddr);
		nvdev->usemem.memaddr = NULL;
	}

	/* Unmap the mapping to the device memory non-cached region */
	if (nvdev->resmem.ioaddr) {
		iounmap(nvdev->resmem.ioaddr);
		nvdev->resmem.ioaddr = NULL;
	}

	/*
	 * NOTE(review): remap_lock is initialized in open_device only when
	 * usemem.memlength != 0 but destroyed unconditionally here — confirm
	 * these ops are only bound when usemem is present.
	 */
	mutex_destroy(&nvdev->remap_lock);

	vfio_pci_core_close_device(core_vdev);
}
244 
/*
 * Poll the BAR0 C2C link and HBM training status registers until both
 * report STATUS_READY.
 *
 * Returns 0 once ready, -ETIME if the device is still not ready after
 * POLL_TIMEOUT_MS.
 */
static int nvgrace_gpu_wait_device_ready(void __iomem *io)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);

	do {
		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
			return 0;
		msleep(POLL_QUANTUM_MS);
	} while (!time_after(jiffies, timeout));

	/*
	 * The thread may have been scheduled out for longer than the poll
	 * quantum right before the timeout check. Sample the status one
	 * final time so a device that became ready in that window is not
	 * spuriously reported as timed out (same pattern as the iopoll.h
	 * read_poll_timeout() helpers).
	 */
	if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
	    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
		return 0;

	return -ETIME;
}
258 
/*
 * If the GPU memory is accessed by the CPU while the GPU is not ready
 * after reset, it can cause harmless corrected RAS events to be logged.
 * Make sure the GPU is ready before establishing the mappings.
 *
 * Caller must hold vdev->memory_lock for read. Returns 0 when the
 * device is usable, -EIO if its memory space is disabled, or the
 * readiness poll error.
 */
static int
nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	int ret;

	lockdep_assert_held_read(&vdev->memory_lock);

	/* Nothing to verify unless the GPU was just reset. */
	if (!nvdev->reset_done)
		return 0;

	if (!__vfio_pci_memory_enabled(vdev))
		return -EIO;

	ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
	if (ret)
		return ret;

	/* Clear the flag only once readiness has been observed. */
	nvdev->reset_done = false;

	return 0;
}
286 
addr_to_pgoff(struct vm_area_struct * vma,unsigned long addr)287 static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
288 				   unsigned long addr)
289 {
290 	u64 pgoff = vma->vm_pgoff &
291 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
292 
293 	return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
294 }
295 
/*
 * Fault handler for the fake BAR VMAs: insert a PFN mapping of the given
 * @order backed by the GPU device memory. Returns VM_FAULT_FALLBACK when
 * alignment rules out this order, and VM_FAULT_SIGBUS when the region is
 * absent, runtime PM is engaged, or the device is not ready after reset.
 */
static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
						  unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	unsigned int index =
		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	vm_fault_t ret = VM_FAULT_FALLBACK;
	struct mem_region *memregion;
	unsigned long pfn, addr;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return VM_FAULT_SIGBUS;

	/* Align the fault address down to the mapping order being tried. */
	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);

	if (is_aligned_for_order(vma, addr, pfn, order)) {
		/* memory_lock serializes against reset/PM state changes. */
		scoped_guard(rwsem_read, &vdev->memory_lock) {
			if (vdev->pm_runtime_engaged ||
			    nvgrace_gpu_check_device_ready(nvdev))
				return VM_FAULT_SIGBUS;

			ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
		}
	}

	dev_dbg_ratelimited(&vdev->pdev->dev,
			    "%s order = %d pfn 0x%lx: 0x%x\n",
			    __func__, order, pfn,
			    (unsigned int)ret);

	return ret;
}
332 
/* Order-0 (single page) fault handler; delegates to the huge-fault path. */
static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
{
	return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
}
337 
/* vm_ops installed on fake BAR VMAs; also used to identify our VMAs. */
static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
	.fault = nvgrace_gpu_vfio_pci_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
	.huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
#endif
};
344 
345 static inline
vma_to_nvdev(struct vm_area_struct * vma)346 struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
347 {
348 	/* Check if this VMA belongs to us */
349 	if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
350 		return NULL;
351 
352 	return vma->vm_private_data;
353 }
354 
/*
 * mmap handler: real BAR regions go to the vfio-pci core; the fake
 * usemem/resmem BARs are mapped lazily (via the fault handlers) onto
 * the CPU accessible GPU memory described by the system ACPI tables.
 */
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
			    struct vm_area_struct *vma)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct mem_region *memregion;
	u64 req_len, pgoff, end;
	unsigned int index;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return vfio_pci_core_mmap(core_vdev, vma);

	/*
	 * Request to mmap the BAR. Map to the CPU accessible memory on the
	 * GPU using the memory information gathered from the system ACPI
	 * tables.
	 */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	/* Reject requests whose size arithmetic would wrap. */
	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
		return -EOVERFLOW;

	/*
	 * Check that the mapping request does not go beyond the exposed
	 * device memory size.
	 */
	if (end > memregion->memlength)
		return -EINVAL;

	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

	/*
	 * The carved out region of the device memory needs the NORMAL_NC
	 * property. Communicate as such to the hypervisor.
	 */
	if (index == RESMEM_REGION_INDEX) {
		/*
		 * The nvgrace-gpu module has no issues with uncontained
		 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
		 * set to communicate to the KVM to S2 map as NORMAL_NC.
		 * This opens up guest usage of NORMAL_NC for this mapping.
		 */
		vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);

		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
	}

	/* Populate pages on demand through the fault handlers above. */
	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
	vma->vm_private_data = nvdev;

	return 0;
}
413 
/*
 * VFIO_DEVICE_GET_REGION_INFO handler: fake BAR regions report the
 * power-of-2 rounded bar_size plus a sparse mmap capability restricted
 * to the actual device memory; every other region is handled by the
 * vfio-pci core.
 */
static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
					     struct vfio_region_info *info,
					     struct vfio_info_cap *caps)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct vfio_region_info_cap_sparse_mmap *sparse;
	struct mem_region *memregion;
	u32 size;
	int ret;

	/*
	 * Request to determine the BAR region information. Send the
	 * GPU memory information.
	 */
	memregion = nvgrace_gpu_memregion(info->index, nvdev);
	if (!memregion)
		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);

	size = struct_size(sparse, areas, 1);

	/*
	 * Setup for sparse mapping for the device memory. Only the
	 * available device memory on the hardware is shown as a
	 * mappable region.
	 */
	sparse = kzalloc(size, GFP_KERNEL);
	if (!sparse)
		return -ENOMEM;

	sparse->nr_areas = 1;
	sparse->areas[0].offset = 0;
	sparse->areas[0].size = memregion->memlength;
	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
	sparse->header.version = 1;

	/* The capability is copied by the core; free our scratch copy. */
	ret = vfio_info_add_capability(caps, &sparse->header, size);
	kfree(sparse);
	if (ret)
		return ret;

	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
	/*
	 * The region memory size may not be power-of-2 aligned.
	 * Given that the memory is a BAR and may not be
	 * aligned, roundup to the next power-of-2.
	 */
	info->size = memregion->bar_size;
	info->flags = VFIO_REGION_INFO_FLAG_READ |
		     VFIO_REGION_INFO_FLAG_WRITE |
		     VFIO_REGION_INFO_FLAG_MMAP;
	return 0;
}
468 
nvgrace_gpu_ioctl(struct vfio_device * core_vdev,unsigned int cmd,unsigned long arg)469 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
470 			      unsigned int cmd, unsigned long arg)
471 {
472 	switch (cmd) {
473 	case VFIO_DEVICE_IOEVENTFD:
474 		return -ENOTTY;
475 	case VFIO_DEVICE_RESET:
476 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
477 		fallthrough;
478 	default:
479 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
480 	}
481 }
482 
483 static __le64
nvgrace_gpu_get_read_value(size_t bar_size,u64 flags,__le64 val64)484 nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
485 {
486 	u64 tmp_val;
487 
488 	tmp_val = le64_to_cpu(val64);
489 	tmp_val &= ~(bar_size - 1);
490 	tmp_val |= flags;
491 
492 	return cpu_to_le64(tmp_val);
493 }
494 
/*
 * Both the usable (usemem) and the reserved (resmem) device memory region
 * are exposed as a 64b fake device BARs in the VM. These fake BARs must
 * respond to the accesses on their respective PCI config space offsets.
 *
 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
 */
static ssize_t
nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
			    char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	__le64 val64;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;
	int ret;

	/* Let the core fill the buffer first; fake BARs are overlaid below. */
	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
	if (ret < 0)
		return ret;

	/* Does the read overlap either fake 64b BAR register pair? */
	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(val64),
						&copy_offset, &copy_count,
						&register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count,
						     PCI_BASE_ADDRESS_4,
						     sizeof(val64),
						     &copy_offset, &copy_count,
						     &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
						   PCI_BASE_ADDRESS_MEM_PREFETCH,
						   memregion->bar_val);
		if (copy_to_user(buf + copy_offset,
				 (void *)&val64 + register_offset, copy_count)) {
			/*
			 * The position has been incremented in
			 * vfio_pci_core_read. Reset the offset back to the
			 * starting position.
			 */
			*ppos -= count;
			return -EFAULT;
		}
	}

	return count;
}
553 
/*
 * Config space write handler: writes overlapping the fake BAR offset
 * registers are captured into bar_val for later readback; everything
 * else is forwarded to the vfio-pci core.
 */
static ssize_t
nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
			     const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;

	/* Does the write overlap either fake 64b BAR register pair? */
	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(u64), &copy_offset,
						&copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
						     sizeof(u64), &copy_offset,
						     &copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		/* Store only the overlapping bytes of the emulated register. */
		if (copy_from_user((void *)&memregion->bar_val + register_offset,
				   buf + copy_offset, copy_count))
			return -EFAULT;
		*ppos += copy_count;
		return copy_count;
	}

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}
586 
/*
 * Ad hoc map the device memory in the module kernel VA space. Primarily needed
 * as vfio does not require the userspace driver to only perform accesses through
 * mmaps of the vfio-pci BAR regions and such accesses should be supported using
 * vfio_device_ops read/write implementations.
 *
 * The usemem region is cacheable memory and hence is memremaped.
 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
 */
static int
nvgrace_gpu_map_device_mem(int index,
			   struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct mem_region *memregion;
	int ret = 0;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return -EINVAL;

	/* Serialize concurrent first-use mapping attempts. */
	mutex_lock(&nvdev->remap_lock);

	/* Already mapped by an earlier access — nothing to do. */
	if (memregion->memaddr)
		goto unlock;

	if (index == USEMEM_REGION_INDEX)
		memregion->memaddr = memremap(memregion->memphys,
					      memregion->memlength,
					      MEMREMAP_WB);
	else
		memregion->ioaddr = ioremap_wc(memregion->memphys,
					       memregion->memlength);

	/*
	 * memaddr and ioaddr alias in a union, so this single NULL check
	 * covers failure of either mapping flavor above.
	 */
	if (!memregion->memaddr)
		ret = -ENOMEM;

unlock:
	mutex_unlock(&nvdev->remap_lock);

	return ret;
}
628 
/*
 * Read the data from the device memory (mapped either through ioremap
 * or memremap) into the user buffer.
 *
 * Returns 0 on success (including mem_count == 0) or a negative errno.
 */
static int
nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
			 char __user *buf, size_t mem_count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	/* Nothing backed by device memory to read. */
	if (!mem_count)
		return 0;

	/*
	 * Handle read on the BAR regions. Map to the target device memory
	 * physical address and copy to the request read buffer.
	 */
	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		/* Cached region: plain memory copy out of the memremap. */
		if (copy_to_user(buf,
				 (u8 *)nvdev->usemem.memaddr + offset,
				 mem_count))
			ret = -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It synthesizes ~0 on such read. So there is
		 * no need to check or support the disablement/enablement of
		 * BAR through PCI_COMMAND config space register. Pass
		 * test_mem flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     buf, offset, mem_count,
					     0, 0, false, VFIO_PCI_IO_WIDTH_8);
	}

	return ret;
}
674 
/*
 * Read count bytes from the device memory at an offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes
 * the size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Reads starting beyond the reported size generate -EINVAL; reads extending
 * beyond the actual device size is filled with ~0; reads extending beyond
 * the reported size are truncated.
 */
static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		     char __user *buf, size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct mem_region *memregion;
	size_t mem_count, i;
	u8 val = 0xFF;	/* Fill byte for reads past the actual device memory */
	int ret;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the read request beyond reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes to be actually read from the device memory.
	 * Read request beyond the actual device memory size is filled with ~0,
	 * while those beyond the actual reported size is skipped.
	 */
	if (offset >= memregion->memlength)
		mem_count = 0;
	else
		mem_count = min(count, memregion->memlength - (size_t)offset);

	/* memory_lock guards against concurrent reset/PM state changes. */
	scoped_guard(rwsem_read, &vdev->memory_lock) {
		ret = nvgrace_gpu_check_device_ready(nvdev);
		if (ret)
			return ret;

		ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
		if (ret)
			return ret;
	}

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. A read to an offset beyond the device memory
	 * size is filled with ~0.
	 */
	for (i = mem_count; i < count; i++) {
		ret = put_user(val, (unsigned char __user *)(buf + i));
		if (ret)
			return ret;
	}

	*ppos += count;
	return count;
}
739 
740 static ssize_t
nvgrace_gpu_read(struct vfio_device * core_vdev,char __user * buf,size_t count,loff_t * ppos)741 nvgrace_gpu_read(struct vfio_device *core_vdev,
742 		 char __user *buf, size_t count, loff_t *ppos)
743 {
744 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
745 	struct nvgrace_gpu_pci_core_device *nvdev =
746 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
747 			     core_device.vdev);
748 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
749 	int ret;
750 
751 	if (nvgrace_gpu_memregion(index, nvdev)) {
752 		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
753 			return -EIO;
754 		ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
755 		pm_runtime_put(&vdev->pdev->dev);
756 		return ret;
757 	}
758 
759 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
760 		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
761 
762 	return vfio_pci_core_read(core_vdev, buf, count, ppos);
763 }
764 
/*
 * Write the data to the device memory (mapped either through ioremap
 * or memremap) from the user buffer.
 *
 * Returns 0 on success (including mem_count == 0) or a negative errno.
 */
static int
nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
			  const char __user *buf, size_t mem_count,
			  loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	/* Nothing backed by device memory to write. */
	if (!mem_count)
		return 0;

	/* Lazily establish the kernel mapping on first access. */
	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		/* Cached region: plain memory copy into the memremap. */
		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
				   buf, mem_count))
			return -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It drops such writes. So there is no need to
		 * check or support the disablement/enablement of BAR
		 * through PCI_COMMAND config space register. Pass test_mem
		 * flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     (char __user *)buf, pos, mem_count,
					     0, 0, true, VFIO_PCI_IO_WIDTH_8);
	}

	return ret;
}
806 
/*
 * Write count bytes to the device memory at a given offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes the
 * size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Writes extending beyond the reported size are truncated; writes starting
 * beyond the reported size generate -EINVAL.
 */
static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		      size_t count, loff_t *ppos, const char __user *buf)
{
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion;
	size_t mem_count;
	int ret = 0;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the write request beyond reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes to be actually written to the device memory.
	 * Do not write to the offset beyond available size.
	 */
	if (offset >= memregion->memlength)
		goto exitfn;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. Drop access outside the available device
	 * memory on the hardware.
	 */
	mem_count = min(count, memregion->memlength - (size_t)offset);

	/* memory_lock guards against concurrent reset/PM state changes. */
	scoped_guard(rwsem_read, &vdev->memory_lock) {
		ret = nvgrace_gpu_check_device_ready(nvdev);
		if (ret)
			return ret;

		ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
		if (ret)
			return ret;
	}

	/* Dropped bytes past the device memory still count as written. */
exitfn:
	*ppos += count;
	return count;
}
863 
864 static ssize_t
nvgrace_gpu_write(struct vfio_device * core_vdev,const char __user * buf,size_t count,loff_t * ppos)865 nvgrace_gpu_write(struct vfio_device *core_vdev,
866 		  const char __user *buf, size_t count, loff_t *ppos)
867 {
868 	struct nvgrace_gpu_pci_core_device *nvdev =
869 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
870 			     core_device.vdev);
871 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
872 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
873 	int ret;
874 
875 	if (nvgrace_gpu_memregion(index, nvdev)) {
876 		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
877 			return -EIO;
878 		ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
879 		pm_runtime_put(&vdev->pdev->dev);
880 		return ret;
881 	}
882 
883 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
884 		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
885 
886 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
887 }
888 
/*
 * dmabuf physical-range provider: the fake BAR regions are backed
 * directly by the carved out device memory, reusing the BAR0 p2pdma
 * provider (rationale in the block comment below); any other region
 * defers to the vfio-pci core.
 */
static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev,
				   struct p2pdma_provider **provider,
				   unsigned int region_index,
				   struct phys_vec *phys_vec,
				   struct vfio_region_dma_range *dma_ranges,
				   size_t nr_ranges)
{
	struct nvgrace_gpu_pci_core_device *nvdev = container_of(
		core_vdev, struct nvgrace_gpu_pci_core_device, core_device);
	struct pci_dev *pdev = core_vdev->pdev;
	struct mem_region *mem_region;

	/*
	 * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) {
	 * 	The P2P properties of the non-BAR memory is the same as the
	 * 	BAR memory, so just use the provider for index 0. Someday
	 * 	when CXL gets P2P support we could create CXLish providers
	 * 	for the non-BAR memory.
	 * } else if (region_index == USEMEM_REGION_INDEX) {
	 * 	This is actually cachable memory and isn't treated as P2P in
	 * 	the chip. For now we have no way to push cachable memory
	 * 	through everything and the Grace HW doesn't care what caching
	 * 	attribute is programmed into the SMMU. So use BAR 0.
	 * }
	 */
	mem_region = nvgrace_gpu_memregion(region_index, nvdev);
	if (mem_region) {
		*provider = pcim_p2pdma_provider(pdev, 0);
		if (!*provider)
			return -EINVAL;
		return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges,
						   nr_ranges,
						   mem_region->memphys,
						   mem_region->memlength);
	}

	return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index,
					     phys_vec, dma_ranges, nr_ranges);
}
928 
/*
 * PCI-layer ops for the variant that exposes the coherent device memory:
 * dma-buf physical ranges for usemem/resmem are resolved by the driver.
 */
static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = {
	.get_dmabuf_phys = nvgrace_get_dmabuf_phys,
};
932 
/*
 * vfio_device_ops used when the ACPI tables describe coherent device
 * memory: read/write/mmap/ioctl paths are intercepted by this driver so
 * the emulated usemem/resmem BARs can be serviced; everything else is
 * delegated to the vfio-pci core.
 */
static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
	.name		= "nvgrace-gpu-vfio-pci",
	.init		= vfio_pci_core_init_dev,
	.release	= vfio_pci_core_release_dev,
	.open_device	= nvgrace_gpu_open_device,
	.close_device	= nvgrace_gpu_close_device,
	.ioctl		= nvgrace_gpu_ioctl,
	.get_region_info_caps = nvgrace_gpu_ioctl_get_region_info,
	.device_feature	= vfio_pci_core_ioctl_feature,
	.read		= nvgrace_gpu_read,
	.write		= nvgrace_gpu_write,
	.mmap		= nvgrace_gpu_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
	.match_token_uuid = vfio_pci_core_match_token_uuid,
	.bind_iommufd	= vfio_iommufd_physical_bind,
	.unbind_iommufd	= vfio_iommufd_physical_unbind,
	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
};
953 
/* PCI-layer ops for the fallback (no device memory) variant. */
static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = {
	.get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
};
957 
/*
 * Fallback vfio_device_ops used when no device memory is described by the
 * ACPI tables: plain vfio-pci behavior apart from the driver-specific
 * open_device hook.
 */
static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
	.name		= "nvgrace-gpu-vfio-pci-core",
	.init		= vfio_pci_core_init_dev,
	.release	= vfio_pci_core_release_dev,
	.open_device	= nvgrace_gpu_open_device,
	.close_device	= vfio_pci_core_close_device,
	.ioctl		= vfio_pci_core_ioctl,
	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
	.device_feature	= vfio_pci_core_ioctl_feature,
	.read		= vfio_pci_core_read,
	.write		= vfio_pci_core_write,
	.mmap		= vfio_pci_core_mmap,
	.request	= vfio_pci_core_request,
	.match		= vfio_pci_core_match,
	.match_token_uuid = vfio_pci_core_match_token_uuid,
	.bind_iommufd	= vfio_iommufd_physical_bind,
	.unbind_iommufd	= vfio_iommufd_physical_unbind,
	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
};
978 
979 static int
nvgrace_gpu_fetch_memory_property(struct pci_dev * pdev,u64 * pmemphys,u64 * pmemlength)980 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
981 				  u64 *pmemphys, u64 *pmemlength)
982 {
983 	int ret;
984 
985 	/*
986 	 * The memory information is present in the system ACPI tables as DSD
987 	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
988 	 */
989 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
990 				       pmemphys);
991 	if (ret)
992 		return ret;
993 
994 	if (*pmemphys > type_max(phys_addr_t))
995 		return -EOVERFLOW;
996 
997 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
998 				       pmemlength);
999 	if (ret)
1000 		return ret;
1001 
1002 	if (*pmemlength > type_max(size_t))
1003 		return -EOVERFLOW;
1004 
1005 	/*
1006 	 * If the C2C link is not up due to an error, the coherent device
1007 	 * memory size is returned as 0. Fail in such case.
1008 	 */
1009 	if (*pmemlength == 0)
1010 		return -ENOMEM;
1011 
1012 	return ret;
1013 }
1014 
1015 static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev * pdev,struct nvgrace_gpu_pci_core_device * nvdev,u64 memphys,u64 memlength)1016 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
1017 			      struct nvgrace_gpu_pci_core_device *nvdev,
1018 			      u64 memphys, u64 memlength)
1019 {
1020 	int ret = 0;
1021 	u64 resmem_size = 0;
1022 
1023 	/*
1024 	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
1025 	 * region to support the MIG feature owing to a hardware bug. Since the
1026 	 * device memory is mapped as NORMAL cached, carve out a region from the end
1027 	 * with a different NORMAL_NC property (called as reserved memory and
1028 	 * represented as resmem). This region then is exposed as a 64b BAR
1029 	 * (region 2 and 3) to the VM, while exposing the rest (termed as usable
1030 	 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
1031 	 *
1032 	 *               devmem (memlength)
1033 	 * |-------------------------------------------------|
1034 	 * |                                           |
1035 	 * usemem.memphys                              resmem.memphys
1036 	 *
1037 	 * This hardware bug is fixed on the Grace Blackwell platforms and the
1038 	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
1039 	 * Thus on systems with the hardware fix, there is no need to partition
1040 	 * the GPU device memory and the entire memory is usable and mapped as
1041 	 * NORMAL cached (i.e. resmem size is 0).
1042 	 */
1043 	if (nvdev->has_mig_hw_bug)
1044 		resmem_size = SZ_1G;
1045 
1046 	nvdev->usemem.memphys = memphys;
1047 
1048 	/*
1049 	 * The device memory exposed to the VM is added to the kernel by the
1050 	 * VM driver module in chunks of memory block size. Note that only the
1051 	 * usable memory (usemem) is added to the kernel for usage by the VM
1052 	 * workloads.
1053 	 */
1054 	if (check_sub_overflow(memlength, resmem_size,
1055 			       &nvdev->usemem.memlength)) {
1056 		ret = -EOVERFLOW;
1057 		goto done;
1058 	}
1059 
1060 	/*
1061 	 * The usemem region is exposed as a 64B Bar composed of region 4 and 5.
1062 	 * Calculate and save the BAR size for the region.
1063 	 */
1064 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
1065 
1066 	/*
1067 	 * If the hardware has the fix for MIG, there is no requirement
1068 	 * for splitting the device memory to create RESMEM. The entire
1069 	 * device memory is usable and will be USEMEM. Return here for
1070 	 * such case.
1071 	 */
1072 	if (!nvdev->has_mig_hw_bug)
1073 		goto done;
1074 
1075 	/*
1076 	 * When the device memory is split to workaround the MIG bug on
1077 	 * Grace Hopper, the USEMEM part of the device memory has to be
1078 	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
1079 	 * GPU FW and VFIO driver. The VM device driver is also aware of it
1080 	 * and make use of the value for its calculation to determine USEMEM
1081 	 * size. Note that the device memory may not be 512M aligned.
1082 	 */
1083 	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
1084 					     MEMBLK_SIZE);
1085 	if (nvdev->usemem.memlength == 0) {
1086 		ret = -EINVAL;
1087 		goto done;
1088 	}
1089 
1090 	if ((check_add_overflow(nvdev->usemem.memphys,
1091 				nvdev->usemem.memlength,
1092 				&nvdev->resmem.memphys)) ||
1093 	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
1094 				&nvdev->resmem.memlength))) {
1095 		ret = -EOVERFLOW;
1096 		goto done;
1097 	}
1098 
1099 	/*
1100 	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
1101 	 * for Grace Hopper. Calculate and save the BAR size for the region.
1102 	 */
1103 	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
1104 done:
1105 	return ret;
1106 }
1107 
nvgrace_gpu_has_mig_hw_bug(struct pci_dev * pdev)1108 static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
1109 {
1110 	int pcie_dvsec;
1111 	u16 dvsec_ctrl16;
1112 
1113 	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
1114 					       GPU_CAP_DVSEC_REGISTER);
1115 
1116 	if (pcie_dvsec) {
1117 		pci_read_config_word(pdev,
1118 				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
1119 				     &dvsec_ctrl16);
1120 
1121 		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
1122 			return false;
1123 	}
1124 
1125 	return true;
1126 }
1127 
1128 /*
1129  * To reduce the system bootup time, the HBM training has
1130  * been moved out of the UEFI on the Grace-Blackwell systems.
1131  *
1132  * The onus of checking whether the HBM training has completed
1133  * thus falls on the module. The HBM training status can be
1134  * determined from a BAR0 register.
1135  *
1136  * Similarly, another BAR0 register exposes the status of the
1137  * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
1138  *
1139  * Poll these register and check for 30s. If the HBM training is
1140  * not complete or if the C2C link is not ready, fail the probe.
1141  *
1142  * While the wait is not required on Grace Hopper systems, it
1143  * is beneficial to make the check to ensure the device is in an
1144  * expected state.
1145  *
1146  * Ensure that the BAR0 region is enabled before accessing the
1147  * registers.
1148  */
nvgrace_gpu_probe_check_device_ready(struct pci_dev * pdev)1149 static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
1150 {
1151 	void __iomem *io;
1152 	int ret;
1153 
1154 	ret = pci_enable_device(pdev);
1155 	if (ret)
1156 		return ret;
1157 
1158 	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
1159 	if (ret)
1160 		goto request_region_exit;
1161 
1162 	io = pci_iomap(pdev, 0, 0);
1163 	if (!io) {
1164 		ret = -ENOMEM;
1165 		goto iomap_exit;
1166 	}
1167 
1168 	ret = nvgrace_gpu_wait_device_ready(io);
1169 
1170 	pci_iounmap(pdev, io);
1171 iomap_exit:
1172 	pci_release_selected_regions(pdev, 1 << 0);
1173 request_region_exit:
1174 	pci_disable_device(pdev);
1175 	return ret;
1176 }
1177 
static int nvgrace_gpu_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
	struct nvgrace_gpu_pci_core_device *nvdev;
	u64 memphys, memlength;
	int ret;

	/* Fail the probe if HBM training or the C2C link is not ready. */
	ret = nvgrace_gpu_probe_check_device_ready(pdev);
	if (ret)
		return ret;

	/*
	 * Use the device-memory-aware ops only when the ACPI tables
	 * describe the coherent device memory; otherwise fall back to
	 * the plain vfio-pci variant.
	 */
	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
	if (!ret)
		ops = &nvgrace_gpu_pci_ops;

	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
				  &pdev->dev, ops);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

	if (ops == &nvgrace_gpu_pci_ops) {
		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

		/*
		 * Device memory properties are identified in the host ACPI
		 * table. Set the nvgrace_gpu_pci_core_device structure.
		 */
		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
						    memphys, memlength);
		if (ret)
			goto out_put_vdev;
		nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops;
	} else {
		nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops;
	}

	ret = vfio_pci_core_register_device(&nvdev->core_device);
	if (ret)
		goto out_put_vdev;

	return ret;

out_put_vdev:
	/* Drops the reference taken by vfio_alloc_device(). */
	vfio_put_device(&nvdev->core_device.vdev);
	return ret;
}
1227 
nvgrace_gpu_remove(struct pci_dev * pdev)1228 static void nvgrace_gpu_remove(struct pci_dev *pdev)
1229 {
1230 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
1231 
1232 	vfio_pci_core_unregister_device(core_device);
1233 	vfio_put_device(&core_device->vdev);
1234 }
1235 
/* NVIDIA Grace-based GPU SKUs handled by this driver. */
static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
	/* GH200 120GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
	/* GH200 480GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
	/* GH200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
	/* GB200 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
	/* GB300 SKU */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x31C2) },
	{}
};

MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
1251 
1252 /*
1253  * The GPU reset is required to be serialized against the *first* mapping
1254  * faults and read/writes accesses to prevent potential RAS events logging.
1255  *
1256  * First fault or access after a reset needs to poll device readiness,
1257  * flag that a reset has occurred. The readiness test is done by holding
1258  * the memory_lock read lock and we expect all vfio-pci initiated resets to
1259  * hold the memory_lock write lock to avoid races. However, .reset_done
1260  * extends beyond the scope of vfio-pci initiated resets therefore we
1261  * cannot assert this behavior and use lockdep_assert_held_write.
1262  */
nvgrace_gpu_vfio_pci_reset_done(struct pci_dev * pdev)1263 static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
1264 {
1265 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
1266 	struct nvgrace_gpu_pci_core_device *nvdev =
1267 		container_of(core_device, struct nvgrace_gpu_pci_core_device,
1268 			     core_device);
1269 
1270 	nvdev->reset_done = true;
1271 }
1272 
static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = {
	/* Flag resets so the next access re-polls device readiness. */
	.reset_done = nvgrace_gpu_vfio_pci_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};
1277 
/* PCI driver registration for the nvgrace-gpu variant driver. */
static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = nvgrace_gpu_vfio_pci_table,
	.probe = nvgrace_gpu_probe,
	.remove = nvgrace_gpu_remove,
	.err_handler = &nvgrace_gpu_vfio_pci_err_handlers,
	.driver_managed_dma = true,
};
1286 
/* Standard module registration and metadata. */
module_pci_driver(nvgrace_gpu_vfio_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
1293