xref: /linux/drivers/vfio/pci/nvgrace-gpu/main.c (revision 682ecb14e83840e87ea36c6d7c16c5111ce18784)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/bitfield.h>
7 #include <linux/sizes.h>
8 #include <linux/time64.h>
9 #include <linux/vfio_pci_core.h>
10 #include <linux/delay.h>
11 #include <linux/jiffies.h>
12 #include <linux/sched.h>
13 #include <linux/pci-p2pdma.h>
14 #include <linux/pm_runtime.h>
15 #include <linux/memory-failure.h>
16 
/*
 * The device memory usable by the workloads running in the VM is cached
 * and exposed to the VM as a 64b device BAR (comprising the BAR4 and BAR5
 * regions); it is represented as usemem.
 * Moreover, the VM GPU device driver needs a non-cacheable region to
 * support the MIG feature. This region is also exposed as a 64b BAR
 * (comprising the BAR2 and BAR3 regions) and is represented as resmem.
 */
/* VFIO region indexes backing the two fake 64b BARs. */
#define RESMEM_REGION_INDEX		VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX		VFIO_PCI_BAR4_REGION_INDEX

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE			SZ_512M

#define DVSEC_BITMAP_OFFSET		0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM	BIT(0)

#define GPU_CAP_DVSEC_REGISTER		3

/*
 * BAR0 registers polled on the legacy readiness path; the GPU is treated
 * as ready once both read back STATUS_READY.
 */
#define C2C_LINK_BAR0_OFFSET		0x1498
#define HBM_TRAINING_BAR0_OFFSET	0x200BC
#define STATUS_READY			0xFF

/* Sleep interval between readiness polls, and the legacy overall timeout. */
#define POLL_QUANTUM_MS			1000
#define POLL_TIMEOUT_MS			(30 * 1000)
42 
/*
 * The state of the two device memory regions - resmem and usemem - is
 * saved as struct mem_region, one instance per region.
 */
struct mem_region {
	phys_addr_t memphys;    /* Base physical address of the region */
	size_t memlength;       /* Region size */
	size_t bar_size;        /* Reported region BAR size */
	__le64 bar_val;         /* Emulated BAR offset registers */
	/*
	 * Both members alias the same storage. The driver fills memaddr via
	 * memremap() for usemem and ioaddr via ioremap_wc() for resmem, and
	 * tests memaddr to tell whether either mapping exists.
	 */
	union {
		void *memaddr;
		void __iomem *ioaddr;
	};                      /* Base virtual address of the region */
	/* PFN range descriptor handed to register_pfn_address_space() */
	struct pfn_address_space pfn_address_space;
};
58 
/*
 * Driver-private device state. It embeds the vfio-pci core device, so the
 * callbacks in this file recover it from a struct vfio_device (or a
 * struct vfio_pci_core_device) with container_of().
 */
struct nvgrace_gpu_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* Cached and usable memory for the VM. */
	struct mem_region usemem;
	/* Non cached memory carved out from the end of device memory */
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
	/* BAR0 iomap; stored at the end of open_device, cleared in close_device */
	void __iomem *bar0_base;
	bool has_mig_hw_bug;
	/* GPU has just been reset */
	bool reset_done;
	/* CXL Device DVSEC offset; 0 if not present (legacy GB path) */
	int cxl_dvsec;
};
74 
75 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
76 {
77 	struct nvgrace_gpu_pci_core_device *nvdev =
78 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
79 			     core_device.vdev);
80 
81 	nvdev->resmem.bar_val = 0;
82 	nvdev->usemem.bar_val = 0;
83 }
84 
85 /* Choose the structure corresponding to the fake BAR with a given index. */
86 static struct mem_region *
87 nvgrace_gpu_memregion(int index,
88 		      struct nvgrace_gpu_pci_core_device *nvdev)
89 {
90 	if (index == USEMEM_REGION_INDEX)
91 		return &nvdev->usemem;
92 
93 	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
94 		return &nvdev->resmem;
95 
96 	return NULL;
97 }
98 
99 static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev,
100 				unsigned int index,
101 				unsigned long pfn,
102 				pgoff_t *pfn_offset_in_region)
103 {
104 	struct mem_region *region;
105 	unsigned long start_pfn, num_pages;
106 
107 	region = nvgrace_gpu_memregion(index, nvdev);
108 	if (!region)
109 		return -EINVAL;
110 
111 	start_pfn = PHYS_PFN(region->memphys);
112 	num_pages = region->memlength >> PAGE_SHIFT;
113 
114 	if (pfn < start_pfn || pfn >= start_pfn + num_pages)
115 		return -EFAULT;
116 
117 	*pfn_offset_in_region = pfn - start_pfn;
118 
119 	return 0;
120 }
121 
/*
 * Forward declaration: the definition follows the vm_operations_struct it
 * compares against, further down in this file.
 */
static inline
struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma);
124 
125 static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma,
126 					unsigned long pfn,
127 					pgoff_t *pgoff)
128 {
129 	struct nvgrace_gpu_pci_core_device *nvdev;
130 	unsigned int index =
131 		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
132 	pgoff_t vma_offset_in_region = vma->vm_pgoff &
133 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
134 	pgoff_t pfn_offset_in_region;
135 	int ret;
136 
137 	nvdev = vma_to_nvdev(vma);
138 	if (!nvdev)
139 		return -ENOENT;
140 
141 	ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region);
142 	if (ret)
143 		return ret;
144 
145 	/* Ensure PFN is not before VMA's start within the region */
146 	if (pfn_offset_in_region < vma_offset_in_region)
147 		return -EFAULT;
148 
149 	/* Calculate offset from VMA start */
150 	*pgoff = vma->vm_pgoff +
151 		 (pfn_offset_in_region - vma_offset_in_region);
152 
153 	return 0;
154 }
155 
156 static int
157 nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev,
158 					struct mem_region *region)
159 {
160 	unsigned long pfn, nr_pages;
161 
162 	pfn = PHYS_PFN(region->memphys);
163 	nr_pages = region->memlength >> PAGE_SHIFT;
164 
165 	region->pfn_address_space.node.start = pfn;
166 	region->pfn_address_space.node.last = pfn + nr_pages - 1;
167 	region->pfn_address_space.mapping = core_vdev->inode->i_mapping;
168 	region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff;
169 
170 	return register_pfn_address_space(&region->pfn_address_space);
171 }
172 
173 static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
174 {
175 	struct vfio_pci_core_device *vdev =
176 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
177 	struct nvgrace_gpu_pci_core_device *nvdev =
178 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
179 			     core_device.vdev);
180 	void __iomem *io;
181 	int ret;
182 
183 	ret = vfio_pci_core_enable(vdev);
184 	if (ret)
185 		return ret;
186 
187 	if (nvdev->usemem.memlength) {
188 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
189 		mutex_init(&nvdev->remap_lock);
190 	}
191 
192 	/*
193 	 * GPU readiness is checked by reading the BAR0 registers.
194 	 * The BAR map was just set up by vfio_pci_core_enable(), so
195 	 * bail early if that wasn't successful:
196 	 */
197 	io = vfio_pci_core_get_iomap(vdev, 0);
198 	if (IS_ERR(io)) {
199 		ret = PTR_ERR(io);
200 		goto error_exit;
201 	}
202 
203 	if (nvdev->resmem.memlength) {
204 		ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem);
205 		if (ret && ret != -EOPNOTSUPP)
206 			goto error_exit;
207 	}
208 
209 	ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem);
210 	if (ret && ret != -EOPNOTSUPP)
211 		goto register_mem_failed;
212 
213 	vfio_pci_core_finish_enable(vdev);
214 	nvdev->bar0_base = io;
215 
216 	return 0;
217 
218 register_mem_failed:
219 	if (nvdev->resmem.memlength)
220 		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
221 error_exit:
222 	vfio_pci_core_disable(vdev);
223 	return ret;
224 }
225 
226 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
227 {
228 	struct nvgrace_gpu_pci_core_device *nvdev =
229 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
230 			     core_device.vdev);
231 
232 	nvdev->bar0_base = NULL;
233 
234 	if (nvdev->resmem.memlength)
235 		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
236 
237 	unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
238 
239 	/* Unmap the mapping to the device memory cached region */
240 	if (nvdev->usemem.memaddr) {
241 		memunmap(nvdev->usemem.memaddr);
242 		nvdev->usemem.memaddr = NULL;
243 	}
244 
245 	/* Unmap the mapping to the device memory non-cached region */
246 	if (nvdev->resmem.ioaddr) {
247 		iounmap(nvdev->resmem.ioaddr);
248 		nvdev->resmem.ioaddr = NULL;
249 	}
250 
251 	mutex_destroy(&nvdev->remap_lock);
252 
253 	vfio_pci_core_close_device(core_vdev);
254 }
255 
256 static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io)
257 {
258 	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
259 
260 	do {
261 		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
262 		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
263 			return 0;
264 		if (schedule_timeout_killable(msecs_to_jiffies(POLL_QUANTUM_MS)))
265 			return -EINTR;
266 	} while (!time_after(jiffies, timeout));
267 
268 	return -ETIME;
269 }
270 
271 /*
272  * Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low
273  * (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2:
274  * 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s,
275  * 101b-111b = reserved (clamped to 256s).
276  */
277 static inline unsigned long cxl_mem_active_timeout_ms(u8 timeout)
278 {
279 	return MSEC_PER_SEC << (2 * min_t(u8, timeout, 4));
280 }
281 
282 /*
283  * Check if CXL DVSEC reports memory as valid and active.
284  */
285 static inline bool cxl_dvsec_mem_is_active(u32 status)
286 {
287 	return (status & PCI_DVSEC_CXL_MEM_INFO_VALID) &&
288 	       (status & PCI_DVSEC_CXL_MEM_ACTIVE);
289 }
290 
291 static int nvgrace_gpu_test_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev,
292 					     u32 *status)
293 {
294 	struct pci_dev *pdev = nvdev->core_device.pdev;
295 	int cxl_dvsec = nvdev->cxl_dvsec;
296 	u32 val;
297 
298 	pci_read_config_dword(pdev,
299 			      cxl_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
300 			      &val);
301 
302 	if (val == ~0U)
303 		return -ENODEV;
304 
305 	if (status)
306 		*status = val;
307 
308 	if (cxl_dvsec_mem_is_active(val))
309 		return 0;
310 
311 	return -EAGAIN;
312 }
313 
314 /*
315  * As per CXL spec r4.0 sec 8.1.3.8.2, MEM_INFO_VALID needs to be set
316  * within 1s and MEM_ACTIVE within Memory_Active_Timeout (up to ~256s)
317  * after reset and bootup.
318  */
319 static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev)
320 {
321 	unsigned long deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS);
322 	bool active_phase = false;
323 	u32 status;
324 	int ret;
325 
326 	for (;;) {
327 		ret = nvgrace_gpu_test_device_ready_cxl(nvdev, &status);
328 		if (ret != -EAGAIN)
329 			return ret;
330 
331 		if (!active_phase && (status & PCI_DVSEC_CXL_MEM_INFO_VALID)) {
332 			u8 t = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT, status);
333 
334 			deadline = jiffies +
335 				   msecs_to_jiffies(cxl_mem_active_timeout_ms(t));
336 			active_phase = true;
337 		}
338 
339 		if (time_after(jiffies, deadline))
340 			return -ETIME;
341 
342 		if (schedule_timeout_killable(msecs_to_jiffies(POLL_QUANTUM_MS)))
343 			return -EINTR;
344 	}
345 }
346 
347 /*
348  * If the GPU memory is accessed by the CPU while the GPU is not ready
349  * after reset, it can cause harmless corrected RAS events to be logged.
350  * Make sure the GPU is ready before establishing the mappings.
351  *
352  * Since the CXL polling wait could take 256s, it happens outside
353  * memory_lock. Only do quick readiness check under the lock. Legacy
354  * keeps the in-lock poll.
355  */
356 static int
357 nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
358 {
359 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
360 	int ret;
361 
362 	lockdep_assert_held_read(&vdev->memory_lock);
363 
364 	if (!nvdev->reset_done)
365 		return 0;
366 
367 	if (!__vfio_pci_memory_enabled(vdev))
368 		return -EIO;
369 
370 	if (nvdev->cxl_dvsec)
371 		ret = nvgrace_gpu_test_device_ready_cxl(nvdev, NULL);
372 	else
373 		ret = nvgrace_gpu_wait_device_ready_legacy(nvdev->bar0_base);
374 	if (ret)
375 		return ret;
376 
377 	nvdev->reset_done = false;
378 
379 	return 0;
380 }
381 
382 static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
383 				   unsigned long addr)
384 {
385 	u64 pgoff = vma->vm_pgoff &
386 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
387 
388 	return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
389 }
390 
/*
 * Fault handler for mappings of the fake BARs, used for both regular
 * (order 0) and huge faults.
 *
 * Computes the PFN backing the faulting address (aligned down to the
 * requested order) and, if the alignment permits a mapping of that order,
 * inserts it while holding memory_lock for read. Returns VM_FAULT_SIGBUS
 * when the VMA's region is not a fake BAR, the device is runtime-PM
 * engaged or the device is not ready; VM_FAULT_FALLBACK when the
 * alignment check fails; otherwise the result of
 * vfio_pci_vmf_insert_pfn().
 */
static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
						  unsigned int order)
{
	struct vm_area_struct *vma = vmf->vma;
	struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
	struct vfio_pci_core_device *vdev = &nvdev->core_device;
	/* The region index is encoded in the upper bits of vm_pgoff. */
	unsigned int index =
		vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	/* Stays FALLBACK unless the aligned insert below is attempted. */
	vm_fault_t ret = VM_FAULT_FALLBACK;
	struct mem_region *memregion;
	unsigned long pfn, addr;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return VM_FAULT_SIGBUS;

	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
	pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);

	if (is_aligned_for_order(vma, addr, pfn, order)) {
		/*
		 * Exit early under memory_lock to avoid a potentially lengthy
		 * device readiness wait on a runtime-suspended device. Any
		 * race after the lock is dropped is benign as the re-check
		 * inside the scoped guard below catches it.
		 */
		scoped_guard(rwsem_read, &vdev->memory_lock) {
			if (vdev->pm_runtime_engaged)
				return VM_FAULT_SIGBUS;
		}

retry:
		/*
		 * The blocking CXL readiness wait runs without memory_lock
		 * held; any failure of the wait is reported as SIGBUS.
		 */
		if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) &&
		    nvgrace_gpu_wait_device_ready_cxl(nvdev))
			return VM_FAULT_SIGBUS;

		scoped_guard(rwsem_read, &vdev->memory_lock) {
			int rc;

			if (vdev->pm_runtime_engaged)
				return VM_FAULT_SIGBUS;

			/* Re-run the wait if a reset raced us, not SIGBUS. */
			rc = nvgrace_gpu_check_device_ready(nvdev);
			if (rc == -EAGAIN)
				goto retry;
			if (rc)
				return VM_FAULT_SIGBUS;

			ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
		}
	}

	dev_dbg_ratelimited(&vdev->pdev->dev,
			    "%s order = %d pfn 0x%lx: 0x%x\n",
			    __func__, order, pfn,
			    (unsigned int)ret);

	return ret;
}
451 
452 static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
453 {
454 	return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
455 }
456 
/*
 * VMA operations installed by nvgrace_gpu_mmap() on fake BAR mappings.
 * The address of this table also serves as the marker by which
 * vma_to_nvdev() recognises VMAs owned by this driver.
 */
static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
	.fault = nvgrace_gpu_vfio_pci_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
	/* Huge faults are only wired up when the arch supports huge PFN maps. */
	.huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
#endif
};
463 
464 static inline
465 struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
466 {
467 	/* Check if this VMA belongs to us */
468 	if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
469 		return NULL;
470 
471 	return vma->vm_private_data;
472 }
473 
474 static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
475 			    struct vm_area_struct *vma)
476 {
477 	struct nvgrace_gpu_pci_core_device *nvdev =
478 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
479 			     core_device.vdev);
480 	struct mem_region *memregion;
481 	u64 req_len, pgoff, end;
482 	unsigned int index;
483 
484 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
485 
486 	memregion = nvgrace_gpu_memregion(index, nvdev);
487 	if (!memregion)
488 		return vfio_pci_core_mmap(core_vdev, vma);
489 
490 	/*
491 	 * Request to mmap the BAR. Map to the CPU accessible memory on the
492 	 * GPU using the memory information gathered from the system ACPI
493 	 * tables.
494 	 */
495 	pgoff = vma->vm_pgoff &
496 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
497 
498 	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
499 	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
500 		return -EOVERFLOW;
501 
502 	/*
503 	 * Check that the mapping request does not go beyond the exposed
504 	 * device memory size.
505 	 */
506 	if (end > memregion->memlength)
507 		return -EINVAL;
508 
509 	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
510 
511 	/*
512 	 * The carved out region of the device memory needs the NORMAL_NC
513 	 * property. Communicate as such to the hypervisor.
514 	 */
515 	if (index == RESMEM_REGION_INDEX) {
516 		/*
517 		 * The nvgrace-gpu module has no issues with uncontained
518 		 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
519 		 * set to communicate to the KVM to S2 map as NORMAL_NC.
520 		 * This opens up guest usage of NORMAL_NC for this mapping.
521 		 */
522 		vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);
523 
524 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
525 	}
526 
527 	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
528 	vma->vm_private_data = nvdev;
529 
530 	return 0;
531 }
532 
533 static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
534 					     struct vfio_region_info *info,
535 					     struct vfio_info_cap *caps)
536 {
537 	struct nvgrace_gpu_pci_core_device *nvdev =
538 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
539 			     core_device.vdev);
540 	struct vfio_region_info_cap_sparse_mmap *sparse;
541 	struct mem_region *memregion;
542 	u32 size;
543 	int ret;
544 
545 	/*
546 	 * Request to determine the BAR region information. Send the
547 	 * GPU memory information.
548 	 */
549 	memregion = nvgrace_gpu_memregion(info->index, nvdev);
550 	if (!memregion)
551 		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
552 
553 	size = struct_size(sparse, areas, 1);
554 
555 	/*
556 	 * Setup for sparse mapping for the device memory. Only the
557 	 * available device memory on the hardware is shown as a
558 	 * mappable region.
559 	 */
560 	sparse = kzalloc(size, GFP_KERNEL);
561 	if (!sparse)
562 		return -ENOMEM;
563 
564 	sparse->nr_areas = 1;
565 	sparse->areas[0].offset = 0;
566 	sparse->areas[0].size = memregion->memlength;
567 	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
568 	sparse->header.version = 1;
569 
570 	ret = vfio_info_add_capability(caps, &sparse->header, size);
571 	kfree(sparse);
572 	if (ret)
573 		return ret;
574 
575 	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
576 	/*
577 	 * The region memory size may not be power-of-2 aligned.
578 	 * Given that the memory is a BAR and may not be
579 	 * aligned, roundup to the next power-of-2.
580 	 */
581 	info->size = memregion->bar_size;
582 	info->flags = VFIO_REGION_INFO_FLAG_READ |
583 		     VFIO_REGION_INFO_FLAG_WRITE |
584 		     VFIO_REGION_INFO_FLAG_MMAP;
585 	return 0;
586 }
587 
588 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
589 			      unsigned int cmd, unsigned long arg)
590 {
591 	switch (cmd) {
592 	case VFIO_DEVICE_IOEVENTFD:
593 		return -ENOTTY;
594 	case VFIO_DEVICE_RESET:
595 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
596 		fallthrough;
597 	default:
598 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
599 	}
600 }
601 
602 static __le64
603 nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
604 {
605 	u64 tmp_val;
606 
607 	tmp_val = le64_to_cpu(val64);
608 	tmp_val &= ~(bar_size - 1);
609 	tmp_val |= flags;
610 
611 	return cpu_to_le64(tmp_val);
612 }
613 
614 /*
615  * Both the usable (usemem) and the reserved (resmem) device memory region
616  * are exposed as a 64b fake device BARs in the VM. These fake BARs must
617  * respond to the accesses on their respective PCI config space offsets.
618  *
619  * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
620  * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
621  */
622 static ssize_t
623 nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
624 			    char __user *buf, size_t count, loff_t *ppos)
625 {
626 	struct nvgrace_gpu_pci_core_device *nvdev =
627 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
628 			     core_device.vdev);
629 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
630 	struct mem_region *memregion = NULL;
631 	__le64 val64;
632 	size_t register_offset;
633 	loff_t copy_offset;
634 	size_t copy_count;
635 	int ret;
636 
637 	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
638 	if (ret < 0)
639 		return ret;
640 
641 	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
642 						sizeof(val64),
643 						&copy_offset, &copy_count,
644 						&register_offset))
645 		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
646 	else if (vfio_pci_core_range_intersect_range(pos, count,
647 						     PCI_BASE_ADDRESS_4,
648 						     sizeof(val64),
649 						     &copy_offset, &copy_count,
650 						     &register_offset))
651 		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
652 
653 	if (memregion) {
654 		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
655 						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
656 						   PCI_BASE_ADDRESS_MEM_PREFETCH,
657 						   memregion->bar_val);
658 		if (copy_to_user(buf + copy_offset,
659 				 (void *)&val64 + register_offset, copy_count)) {
660 			/*
661 			 * The position has been incremented in
662 			 * vfio_pci_core_read. Reset the offset back to the
663 			 * starting position.
664 			 */
665 			*ppos -= count;
666 			return -EFAULT;
667 		}
668 	}
669 
670 	return count;
671 }
672 
673 static ssize_t
674 nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
675 			     const char __user *buf, size_t count, loff_t *ppos)
676 {
677 	struct nvgrace_gpu_pci_core_device *nvdev =
678 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
679 			     core_device.vdev);
680 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
681 	struct mem_region *memregion = NULL;
682 	size_t register_offset;
683 	loff_t copy_offset;
684 	size_t copy_count;
685 
686 	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
687 						sizeof(u64), &copy_offset,
688 						&copy_count, &register_offset))
689 		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
690 	else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
691 						     sizeof(u64), &copy_offset,
692 						     &copy_count, &register_offset))
693 		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
694 
695 	if (memregion) {
696 		if (copy_from_user((void *)&memregion->bar_val + register_offset,
697 				   buf + copy_offset, copy_count))
698 			return -EFAULT;
699 		*ppos += copy_count;
700 		return copy_count;
701 	}
702 
703 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
704 }
705 
706 /*
707  * Ad hoc map the device memory in the module kernel VA space. Primarily needed
708  * as vfio does not require the userspace driver to only perform accesses through
709  * mmaps of the vfio-pci BAR regions and such accesses should be supported using
710  * vfio_device_ops read/write implementations.
711  *
712  * The usemem region is cacheable memory and hence is memremaped.
713  * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
714  */
715 static int
716 nvgrace_gpu_map_device_mem(int index,
717 			   struct nvgrace_gpu_pci_core_device *nvdev)
718 {
719 	struct mem_region *memregion;
720 	int ret = 0;
721 
722 	memregion = nvgrace_gpu_memregion(index, nvdev);
723 	if (!memregion)
724 		return -EINVAL;
725 
726 	mutex_lock(&nvdev->remap_lock);
727 
728 	if (memregion->memaddr)
729 		goto unlock;
730 
731 	if (index == USEMEM_REGION_INDEX)
732 		memregion->memaddr = memremap(memregion->memphys,
733 					      memregion->memlength,
734 					      MEMREMAP_WB);
735 	else
736 		memregion->ioaddr = ioremap_wc(memregion->memphys,
737 					       memregion->memlength);
738 
739 	if (!memregion->memaddr)
740 		ret = -ENOMEM;
741 
742 unlock:
743 	mutex_unlock(&nvdev->remap_lock);
744 
745 	return ret;
746 }
747 
748 /*
749  * Read the data from the device memory (mapped either through ioremap
750  * or memremap) into the user buffer.
751  */
752 static int
753 nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
754 			 char __user *buf, size_t mem_count, loff_t *ppos)
755 {
756 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
757 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
758 	int ret;
759 
760 	if (!mem_count)
761 		return 0;
762 
763 	/*
764 	 * Handle read on the BAR regions. Map to the target device memory
765 	 * physical address and copy to the request read buffer.
766 	 */
767 	ret = nvgrace_gpu_map_device_mem(index, nvdev);
768 	if (ret)
769 		return ret;
770 
771 	if (index == USEMEM_REGION_INDEX) {
772 		if (copy_to_user(buf,
773 				 (u8 *)nvdev->usemem.memaddr + offset,
774 				 mem_count))
775 			ret = -EFAULT;
776 	} else {
777 		/*
778 		 * The hardware ensures that the system does not crash when
779 		 * the device memory is accessed with the memory enable
780 		 * turned off. It synthesizes ~0 on such read. So there is
781 		 * no need to check or support the disablement/enablement of
782 		 * BAR through PCI_COMMAND config space register. Pass
783 		 * test_mem flag as false.
784 		 */
785 		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
786 					     nvdev->resmem.ioaddr,
787 					     buf, offset, mem_count,
788 					     0, 0, false, VFIO_PCI_IO_WIDTH_8);
789 	}
790 
791 	return ret;
792 }
793 
794 /*
795  * Read count bytes from the device memory at an offset. The actual device
796  * memory size (available) may not be a power-of-2. So the driver fakes
797  * the size to a power-of-2 (reported) when exposing to a user space driver.
798  *
799  * Reads starting beyond the reported size generate -EINVAL; reads extending
800  * beyond the actual device size is filled with ~0; reads extending beyond
801  * the reported size are truncated.
802  */
803 static ssize_t
804 nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
805 		     char __user *buf, size_t count, loff_t *ppos)
806 {
807 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
808 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
809 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
810 	struct mem_region *memregion;
811 	size_t mem_count, i;
812 	u8 val = 0xFF;
813 	int ret;
814 
815 	/* No need to do NULL check as caller does. */
816 	memregion = nvgrace_gpu_memregion(index, nvdev);
817 
818 	if (offset >= memregion->bar_size)
819 		return -EINVAL;
820 
821 	/* Clip short the read request beyond reported BAR size */
822 	count = min(count, memregion->bar_size - (size_t)offset);
823 
824 	/*
825 	 * Determine how many bytes to be actually read from the device memory.
826 	 * Read request beyond the actual device memory size is filled with ~0,
827 	 * while those beyond the actual reported size is skipped.
828 	 */
829 	if (offset >= memregion->memlength)
830 		mem_count = 0;
831 	else
832 		mem_count = min(count, memregion->memlength - (size_t)offset);
833 
834 	if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) {
835 		ret = nvgrace_gpu_wait_device_ready_cxl(nvdev);
836 		if (ret)
837 			return ret;
838 	}
839 
840 	scoped_guard(rwsem_read, &vdev->memory_lock) {
841 		ret = nvgrace_gpu_check_device_ready(nvdev);
842 		if (ret)
843 			return ret;
844 
845 		ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
846 		if (ret)
847 			return ret;
848 	}
849 
850 	/*
851 	 * Only the device memory present on the hardware is mapped, which may
852 	 * not be power-of-2 aligned. A read to an offset beyond the device memory
853 	 * size is filled with ~0.
854 	 */
855 	for (i = mem_count; i < count; i++) {
856 		ret = put_user(val, (unsigned char __user *)(buf + i));
857 		if (ret)
858 			return ret;
859 	}
860 
861 	*ppos += count;
862 	return count;
863 }
864 
865 static ssize_t
866 nvgrace_gpu_read(struct vfio_device *core_vdev,
867 		 char __user *buf, size_t count, loff_t *ppos)
868 {
869 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
870 	struct nvgrace_gpu_pci_core_device *nvdev =
871 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
872 			     core_device.vdev);
873 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
874 	int ret;
875 
876 	if (nvgrace_gpu_memregion(index, nvdev)) {
877 		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
878 			return -EIO;
879 		ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
880 		pm_runtime_put(&vdev->pdev->dev);
881 		return ret;
882 	}
883 
884 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
885 		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
886 
887 	return vfio_pci_core_read(core_vdev, buf, count, ppos);
888 }
889 
890 /*
891  * Write the data to the device memory (mapped either through ioremap
892  * or memremap) from the user buffer.
893  */
894 static int
895 nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
896 			  const char __user *buf, size_t mem_count,
897 			  loff_t *ppos)
898 {
899 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
900 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
901 	int ret;
902 
903 	if (!mem_count)
904 		return 0;
905 
906 	ret = nvgrace_gpu_map_device_mem(index, nvdev);
907 	if (ret)
908 		return ret;
909 
910 	if (index == USEMEM_REGION_INDEX) {
911 		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
912 				   buf, mem_count))
913 			return -EFAULT;
914 	} else {
915 		/*
916 		 * The hardware ensures that the system does not crash when
917 		 * the device memory is accessed with the memory enable
918 		 * turned off. It drops such writes. So there is no need to
919 		 * check or support the disablement/enablement of BAR
920 		 * through PCI_COMMAND config space register. Pass test_mem
921 		 * flag as false.
922 		 */
923 		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
924 					     nvdev->resmem.ioaddr,
925 					     (char __user *)buf, pos, mem_count,
926 					     0, 0, true, VFIO_PCI_IO_WIDTH_8);
927 	}
928 
929 	return ret;
930 }
931 
932 /*
933  * Write count bytes to the device memory at a given offset. The actual device
934  * memory size (available) may not be a power-of-2. So the driver fakes the
935  * size to a power-of-2 (reported) when exposing to a user space driver.
936  *
937  * Writes extending beyond the reported size are truncated; writes starting
938  * beyond the reported size generate -EINVAL.
939  */
940 static ssize_t
941 nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
942 		      size_t count, loff_t *ppos, const char __user *buf)
943 {
944 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
945 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
946 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
947 	struct mem_region *memregion;
948 	size_t mem_count;
949 	int ret = 0;
950 
951 	/* No need to do NULL check as caller does. */
952 	memregion = nvgrace_gpu_memregion(index, nvdev);
953 
954 	if (offset >= memregion->bar_size)
955 		return -EINVAL;
956 
957 	/* Clip short the write request beyond reported BAR size */
958 	count = min(count, memregion->bar_size - (size_t)offset);
959 
960 	/*
961 	 * Determine how many bytes to be actually written to the device memory.
962 	 * Do not write to the offset beyond available size.
963 	 */
964 	if (offset >= memregion->memlength)
965 		goto exitfn;
966 
967 	/*
968 	 * Only the device memory present on the hardware is mapped, which may
969 	 * not be power-of-2 aligned. Drop access outside the available device
970 	 * memory on the hardware.
971 	 */
972 	mem_count = min(count, memregion->memlength - (size_t)offset);
973 
974 	if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) {
975 		ret = nvgrace_gpu_wait_device_ready_cxl(nvdev);
976 		if (ret)
977 			return ret;
978 	}
979 
980 	scoped_guard(rwsem_read, &vdev->memory_lock) {
981 		ret = nvgrace_gpu_check_device_ready(nvdev);
982 		if (ret)
983 			return ret;
984 
985 		ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
986 		if (ret)
987 			return ret;
988 	}
989 
990 exitfn:
991 	*ppos += count;
992 	return count;
993 }
994 
995 static ssize_t
996 nvgrace_gpu_write(struct vfio_device *core_vdev,
997 		  const char __user *buf, size_t count, loff_t *ppos)
998 {
999 	struct nvgrace_gpu_pci_core_device *nvdev =
1000 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
1001 			     core_device.vdev);
1002 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
1003 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1004 	int ret;
1005 
1006 	if (nvgrace_gpu_memregion(index, nvdev)) {
1007 		if (pm_runtime_resume_and_get(&vdev->pdev->dev))
1008 			return -EIO;
1009 		ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
1010 		pm_runtime_put(&vdev->pdev->dev);
1011 		return ret;
1012 	}
1013 
1014 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
1015 		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
1016 
1017 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
1018 }
1019 
1020 static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev,
1021 				   struct p2pdma_provider **provider,
1022 				   unsigned int region_index,
1023 				   struct phys_vec *phys_vec,
1024 				   struct vfio_region_dma_range *dma_ranges,
1025 				   size_t nr_ranges)
1026 {
1027 	struct nvgrace_gpu_pci_core_device *nvdev = container_of(
1028 		core_vdev, struct nvgrace_gpu_pci_core_device, core_device);
1029 	struct pci_dev *pdev = core_vdev->pdev;
1030 	struct mem_region *mem_region;
1031 
1032 	/*
1033 	 * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) {
1034 	 * 	The P2P properties of the non-BAR memory is the same as the
1035 	 * 	BAR memory, so just use the provider for index 0. Someday
1036 	 * 	when CXL gets P2P support we could create CXLish providers
1037 	 * 	for the non-BAR memory.
1038 	 * } else if (region_index == USEMEM_REGION_INDEX) {
1039 	 * 	This is actually cachable memory and isn't treated as P2P in
1040 	 * 	the chip. For now we have no way to push cachable memory
1041 	 * 	through everything and the Grace HW doesn't care what caching
1042 	 * 	attribute is programmed into the SMMU. So use BAR 0.
1043 	 * }
1044 	 */
1045 	mem_region = nvgrace_gpu_memregion(region_index, nvdev);
1046 	if (mem_region) {
1047 		*provider = pcim_p2pdma_provider(pdev, 0);
1048 		if (!*provider)
1049 			return -EINVAL;
1050 		return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges,
1051 						   nr_ranges,
1052 						   mem_region->memphys,
1053 						   mem_region->memlength);
1054 	}
1055 
1056 	return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index,
1057 					     phys_vec, dma_ranges, nr_ranges);
1058 }
1059 
1060 static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = {
1061 	.get_dmabuf_phys = nvgrace_get_dmabuf_phys,
1062 };
1063 
1064 static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
1065 	.name		= "nvgrace-gpu-vfio-pci",
1066 	.init		= vfio_pci_core_init_dev,
1067 	.release	= vfio_pci_core_release_dev,
1068 	.open_device	= nvgrace_gpu_open_device,
1069 	.close_device	= nvgrace_gpu_close_device,
1070 	.ioctl		= nvgrace_gpu_ioctl,
1071 	.get_region_info_caps = nvgrace_gpu_ioctl_get_region_info,
1072 	.device_feature	= vfio_pci_core_ioctl_feature,
1073 	.read		= nvgrace_gpu_read,
1074 	.write		= nvgrace_gpu_write,
1075 	.mmap		= nvgrace_gpu_mmap,
1076 	.request	= vfio_pci_core_request,
1077 	.match		= vfio_pci_core_match,
1078 	.match_token_uuid = vfio_pci_core_match_token_uuid,
1079 	.bind_iommufd	= vfio_iommufd_physical_bind,
1080 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
1081 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
1082 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
1083 };
1084 
1085 static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = {
1086 	.get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys,
1087 };
1088 
1089 static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
1090 	.name		= "nvgrace-gpu-vfio-pci-core",
1091 	.init		= vfio_pci_core_init_dev,
1092 	.release	= vfio_pci_core_release_dev,
1093 	.open_device	= nvgrace_gpu_open_device,
1094 	.close_device	= vfio_pci_core_close_device,
1095 	.ioctl		= vfio_pci_core_ioctl,
1096 	.get_region_info_caps = vfio_pci_ioctl_get_region_info,
1097 	.device_feature	= vfio_pci_core_ioctl_feature,
1098 	.read		= vfio_pci_core_read,
1099 	.write		= vfio_pci_core_write,
1100 	.mmap		= vfio_pci_core_mmap,
1101 	.request	= vfio_pci_core_request,
1102 	.match		= vfio_pci_core_match,
1103 	.match_token_uuid = vfio_pci_core_match_token_uuid,
1104 	.bind_iommufd	= vfio_iommufd_physical_bind,
1105 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
1106 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
1107 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
1108 };
1109 
1110 static int
1111 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
1112 				  u64 *pmemphys, u64 *pmemlength)
1113 {
1114 	int ret;
1115 
1116 	/*
1117 	 * The memory information is present in the system ACPI tables as DSD
1118 	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
1119 	 */
1120 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
1121 				       pmemphys);
1122 	if (ret)
1123 		return ret;
1124 
1125 	if (*pmemphys > type_max(phys_addr_t))
1126 		return -EOVERFLOW;
1127 
1128 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
1129 				       pmemlength);
1130 	if (ret)
1131 		return ret;
1132 
1133 	if (*pmemlength > type_max(size_t))
1134 		return -EOVERFLOW;
1135 
1136 	/*
1137 	 * If the C2C link is not up due to an error, the coherent device
1138 	 * memory size is returned as 0. Fail in such case.
1139 	 */
1140 	if (*pmemlength == 0)
1141 		return -ENOMEM;
1142 
1143 	return ret;
1144 }
1145 
1146 static int
1147 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
1148 			      struct nvgrace_gpu_pci_core_device *nvdev,
1149 			      u64 memphys, u64 memlength)
1150 {
1151 	int ret = 0;
1152 	u64 resmem_size = 0;
1153 
1154 	/*
1155 	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
1156 	 * region to support the MIG feature owing to a hardware bug. Since the
1157 	 * device memory is mapped as NORMAL cached, carve out a region from the end
1158 	 * with a different NORMAL_NC property (called as reserved memory and
1159 	 * represented as resmem). This region then is exposed as a 64b BAR
1160 	 * (region 2 and 3) to the VM, while exposing the rest (termed as usable
1161 	 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
1162 	 *
1163 	 *               devmem (memlength)
1164 	 * |-------------------------------------------------|
1165 	 * |                                           |
1166 	 * usemem.memphys                              resmem.memphys
1167 	 *
1168 	 * This hardware bug is fixed on the Grace Blackwell platforms and the
1169 	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
1170 	 * Thus on systems with the hardware fix, there is no need to partition
1171 	 * the GPU device memory and the entire memory is usable and mapped as
1172 	 * NORMAL cached (i.e. resmem size is 0).
1173 	 */
1174 	if (nvdev->has_mig_hw_bug)
1175 		resmem_size = SZ_1G;
1176 
1177 	nvdev->usemem.memphys = memphys;
1178 
1179 	/*
1180 	 * The device memory exposed to the VM is added to the kernel by the
1181 	 * VM driver module in chunks of memory block size. Note that only the
1182 	 * usable memory (usemem) is added to the kernel for usage by the VM
1183 	 * workloads.
1184 	 */
1185 	if (check_sub_overflow(memlength, resmem_size,
1186 			       &nvdev->usemem.memlength)) {
1187 		ret = -EOVERFLOW;
1188 		goto done;
1189 	}
1190 
1191 	/*
1192 	 * The usemem region is exposed as a 64B Bar composed of region 4 and 5.
1193 	 * Calculate and save the BAR size for the region.
1194 	 */
1195 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
1196 
1197 	/*
1198 	 * If the hardware has the fix for MIG, there is no requirement
1199 	 * for splitting the device memory to create RESMEM. The entire
1200 	 * device memory is usable and will be USEMEM. Return here for
1201 	 * such case.
1202 	 */
1203 	if (!nvdev->has_mig_hw_bug)
1204 		goto done;
1205 
1206 	/*
1207 	 * When the device memory is split to workaround the MIG bug on
1208 	 * Grace Hopper, the USEMEM part of the device memory has to be
1209 	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
1210 	 * GPU FW and VFIO driver. The VM device driver is also aware of it
1211 	 * and make use of the value for its calculation to determine USEMEM
1212 	 * size. Note that the device memory may not be 512M aligned.
1213 	 */
1214 	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
1215 					     MEMBLK_SIZE);
1216 	if (nvdev->usemem.memlength == 0) {
1217 		ret = -EINVAL;
1218 		goto done;
1219 	}
1220 
1221 	if ((check_add_overflow(nvdev->usemem.memphys,
1222 				nvdev->usemem.memlength,
1223 				&nvdev->resmem.memphys)) ||
1224 	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
1225 				&nvdev->resmem.memlength))) {
1226 		ret = -EOVERFLOW;
1227 		goto done;
1228 	}
1229 
1230 	/*
1231 	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
1232 	 * for Grace Hopper. Calculate and save the BAR size for the region.
1233 	 */
1234 	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
1235 done:
1236 	return ret;
1237 }
1238 
1239 static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
1240 {
1241 	int pcie_dvsec;
1242 	u16 dvsec_ctrl16;
1243 
1244 	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
1245 					       GPU_CAP_DVSEC_REGISTER);
1246 
1247 	if (pcie_dvsec) {
1248 		pci_read_config_word(pdev,
1249 				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
1250 				     &dvsec_ctrl16);
1251 
1252 		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
1253 			return false;
1254 	}
1255 
1256 	return true;
1257 }
1258 
1259 /*
1260  * To reduce the system bootup time, the HBM training has
1261  * been moved out of the UEFI on the Grace-Blackwell systems.
1262  *
1263  * The onus of checking whether the HBM training has completed
1264  * thus falls on the module. The HBM training status can be
1265  * determined from a BAR0 register.
1266  *
1267  * Similarly, another BAR0 register exposes the status of the
1268  * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
1269  *
1270  * Poll these register and check for 30s. If the HBM training is
1271  * not complete or if the C2C link is not ready, fail the probe.
1272  *
1273  * While the wait is not required on Grace Hopper systems, it
1274  * is beneficial to make the check to ensure the device is in an
1275  * expected state.
1276  *
1277  * On Blackwell-Next systems, memory readiness is determined via the
1278  * CXL Device DVSEC in PCI config space and does not require BAR0.
1279  * For the legacy path, ensure BAR0 is enabled before accessing the
1280  * registers.
1281  */
1282 static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
1283 {
1284 	struct pci_dev *pdev = nvdev->core_device.pdev;
1285 	void __iomem *io;
1286 	int ret;
1287 
1288 	/*
1289 	 * Note that the worst-case wait here is ~256s (vs ~30s on the
1290 	 * legacy path) and may block device unbind/sysfs for the duration.
1291 	 */
1292 	if (nvdev->cxl_dvsec)
1293 		return nvgrace_gpu_wait_device_ready_cxl(nvdev);
1294 
1295 	ret = pci_enable_device(pdev);
1296 	if (ret)
1297 		return ret;
1298 
1299 	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
1300 	if (ret)
1301 		goto request_region_exit;
1302 
1303 	io = pci_iomap(pdev, 0, 0);
1304 	if (!io) {
1305 		ret = -ENOMEM;
1306 		goto iomap_exit;
1307 	}
1308 
1309 	ret = nvgrace_gpu_wait_device_ready_legacy(io);
1310 
1311 	pci_iounmap(pdev, io);
1312 iomap_exit:
1313 	pci_release_selected_regions(pdev, 1 << 0);
1314 request_region_exit:
1315 	pci_disable_device(pdev);
1316 	return ret;
1317 }
1318 
1319 static int nvgrace_gpu_probe(struct pci_dev *pdev,
1320 			     const struct pci_device_id *id)
1321 {
1322 	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
1323 	struct nvgrace_gpu_pci_core_device *nvdev;
1324 	u64 memphys, memlength;
1325 	int ret;
1326 
1327 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
1328 	if (!ret)
1329 		ops = &nvgrace_gpu_pci_ops;
1330 
1331 	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
1332 				  &pdev->dev, ops);
1333 	if (IS_ERR(nvdev))
1334 		return PTR_ERR(nvdev);
1335 
1336 	nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
1337 						     PCI_DVSEC_CXL_DEVICE);
1338 
1339 	ret = nvgrace_gpu_probe_check_device_ready(nvdev);
1340 	if (ret)
1341 		goto out_put_vdev;
1342 
1343 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
1344 
1345 	if (ops == &nvgrace_gpu_pci_ops) {
1346 		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
1347 
1348 		/*
1349 		 * Device memory properties are identified in the host ACPI
1350 		 * table. Set the nvgrace_gpu_pci_core_device structure.
1351 		 */
1352 		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
1353 						    memphys, memlength);
1354 		if (ret)
1355 			goto out_put_vdev;
1356 		nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops;
1357 	} else {
1358 		nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops;
1359 	}
1360 
1361 	ret = vfio_pci_core_register_device(&nvdev->core_device);
1362 	if (ret)
1363 		goto out_put_vdev;
1364 
1365 	return ret;
1366 
1367 out_put_vdev:
1368 	vfio_put_device(&nvdev->core_device.vdev);
1369 	return ret;
1370 }
1371 
1372 static void nvgrace_gpu_remove(struct pci_dev *pdev)
1373 {
1374 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
1375 
1376 	vfio_pci_core_unregister_device(core_device);
1377 	vfio_put_device(&core_device->vdev);
1378 }
1379 
1380 static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
1381 	/* GH200 120GB */
1382 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
1383 	/* GH200 480GB */
1384 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
1385 	/* GH200 SKU */
1386 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
1387 	/* GB200 SKU */
1388 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
1389 	/* GB300 SKU */
1390 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x31C2) },
1391 	{}
1392 };
1393 
1394 MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
1395 
1396 /*
1397  * The GPU reset is required to be serialized against the *first* mapping
1398  * faults and read/writes accesses to prevent potential RAS events logging.
1399  *
1400  * First fault or access after a reset needs to poll device readiness,
1401  * flag that a reset has occurred. The readiness test is done by holding
1402  * the memory_lock read lock and we expect all vfio-pci initiated resets to
1403  * hold the memory_lock write lock to avoid races. However, .reset_done
1404  * extends beyond the scope of vfio-pci initiated resets therefore we
1405  * cannot assert this behavior and use lockdep_assert_held_write.
1406  */
1407 static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
1408 {
1409 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
1410 	struct nvgrace_gpu_pci_core_device *nvdev =
1411 		container_of(core_device, struct nvgrace_gpu_pci_core_device,
1412 			     core_device);
1413 
1414 	nvdev->reset_done = true;
1415 }
1416 
1417 static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = {
1418 	.reset_done = nvgrace_gpu_vfio_pci_reset_done,
1419 	.error_detected = vfio_pci_core_aer_err_detected,
1420 };
1421 
1422 static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
1423 	.name = KBUILD_MODNAME,
1424 	.id_table = nvgrace_gpu_vfio_pci_table,
1425 	.probe = nvgrace_gpu_probe,
1426 	.remove = nvgrace_gpu_remove,
1427 	.err_handler = &nvgrace_gpu_vfio_pci_err_handlers,
1428 	.driver_managed_dma = true,
1429 };
1430 
1431 module_pci_driver(nvgrace_gpu_vfio_pci_driver);
1432 
1433 MODULE_LICENSE("GPL");
1434 MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
1435 MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
1436 MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
1437