// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <generated/uapi/linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (well, technically it can work with an IOMMU, but then it is almost useless).
 * 2. Only READ operations (can extend in the future).
 * 3. No sparse files (can overcome this in the future).
 * 4. Kernel version >= 6.9
 * 5. Requiring page alignment is OK (I don't see a solution to this one right
 *    now; how do we read partial pages?)
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    Theoretically I have a slight idea on how this could be solvable, but it
 *    is probably unacceptable for the upstream. Also may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */
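
/*
 * For orientation, a minimal sketch of the expected user-space usage under
 * the assumptions above. The ioctl name and argument layout are hypothetical
 * and only illustrate the O_DIRECT and page-alignment requirements:
 *
 *	int fd = open("/mnt/nvme/data.bin", O_RDONLY | O_DIRECT);
 *
 *	struct hypothetical_dio_args args = {
 *		.fd = fd,
 *		.device_va = device_va,	// page-aligned device VA
 *		.offset = 0,		// page-aligned file offset
 *		.len = len,		// page-aligned length
 *	};
 *	rc = ioctl(hl_fd, HYPOTHETICAL_HL_DIO_SSD2HL, &args);
 */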

#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the useful data I could milk out of the file handle
 * provided by the user.
 * @TODO: right now it is retrieved on each IO, but can be done once with some
 * dedicated IOCTL, call it for example HL_REGISTER_HANDLE.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -ENOENT;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not opened in O_DIRECT mode\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported and no legacy fallback is implemented\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;

	if (inode->i_blocks << sb->s_blocksize_bits < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	/* Validate the block device before dereferencing it for the disk */
	if (!bd || !bd->bd_disk) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}
	gd = bd->bd_disk;

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}

static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

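/*
 * IO accounting: hl_dio_get_iopath() increments the per-cpu inflight counter
 * and only then re-checks io_enabled, while hl_dio_stop() clears io_enabled
 * first and then polls until hl_dio_count_io() drops to zero. An IO that
 * races with the disable therefore either backs out below or is waited for
 * by the drain loop in hl_dio_stop().
 */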
static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/* Avoid race conditions */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}

static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if (io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be page aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be page aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be page aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)\n",
				device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
				PAGE_SHIFT];
	}

	return NULL;
}

static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = (io->len_bytes >> PAGE_SHIFT);

	/* @TODO: this can be implemented smarter, vmalloc in iopath is not
	 * ideal. Maybe some variation of genpool. Number of pages may differ
	 * greatly, so maybe even use pools of different sizes and choose the
	 * closest one. A possible genpool-based shape is sketched after this
	 * function.
	 */
	io->bv = vzalloc(npages * sizeof(struct bio_vec));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx\n",
					device_va);
			rc = -EFAULT;
			goto cleanup;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	/* One bio_vec segment per page */
	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

cleanup:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}
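
/*
 * A possible shape for the genpool idea from the @TODO above, assuming a
 * worst-case bio_vec array size can be bounded at init time. Everything named
 * here that is not already in this file (bv_pool, backing, HL_DIO_MAX_PAGES,
 * NR_PREALLOC) is hypothetical:
 *
 *	#include <linux/genalloc.h>
 *
 *	// init: carve a pool out of one preallocated backing buffer
 *	backing = vzalloc(NR_PREALLOC * HL_DIO_MAX_PAGES * sizeof(struct bio_vec));
 *	bv_pool = gen_pool_create(ilog2(sizeof(struct bio_vec)), NUMA_NO_NODE);
 *	gen_pool_add(bv_pool, (unsigned long)backing,
 *		     NR_PREALLOC * HL_DIO_MAX_PAGES * sizeof(struct bio_vec),
 *		     NUMA_NO_NODE);
 *
 *	// iopath: allocate and free without hitting vmalloc
 *	io->bv = (struct bio_vec *)gen_pool_alloc(bv_pool,
 *						  npages * sizeof(struct bio_vec));
 *	...
 *	gen_pool_free(bv_pool, (unsigned long)io->bv,
 *		      npages * sizeof(struct bio_vec));
 */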

/*
 * @TODO: This function can be used as a callback for io completion under
 * kio->ki_complete in order to implement async IO.
 * Note that on more recent kernels there is no ret2.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy result to user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}
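
/*
 * A rough sketch of how the callback above could be wired up for an async
 * submission (the exact ki_complete prototype depends on the kernel version,
 * as noted above; error handling trimmed):
 *
 *	init_sync_kiocb(&io->kio, io->f.filp);
 *	io->kio.ki_pos = io->off_bytes;
 *	// a non-NULL ki_complete makes the kiocb asynchronous
 *	io->kio.ki_complete = hl_direct_io_complete;
 *
 *	rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
 *	if (rc == -EIOCBQUEUED)
 *		return 0;	// hl_direct_io_complete() will run later
 *	// otherwise the IO completed (or failed) synchronously
 */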

/*
 * DMA disk to ASIC, wait for results. Must be invoked from user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}
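
/*
 * A minimal sketch of a caller, e.g. a driver-specific ioctl handler. Only
 * the hl_dio_ssd2hl() signature is taken from this file; the args structure
 * and how hdev/ctx are obtained are hypothetical:
 *
 *	size_t len_read = 0;
 *	int rc;
 *
 *	rc = hl_dio_ssd2hl(hdev, ctx, args->fd, args->device_va,
 *			   args->offset, args->len, &len_read);
 *	if (rc)
 *		return rc;
 *	// report len_read back to user space
 */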

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
				p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}
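
/*
 * A p2p region is expected to be described by ASIC-specific code before
 * calling hl_p2p_region_init(). Roughly (a sketch; the BAR number, offset,
 * size and DRAM_PHYS_BASE below are made-up placeholders):
 *
 *	struct hl_p2p_region *p2pr = &hdev->hldio.p2prs[0];
 *
 *	p2pr->bar = 4;				// BAR exposing device memory
 *	p2pr->bar_offset = 0;			// offset of the region in that BAR
 *	p2pr->size = SZ_256M;			// multiple of PAGE_SIZE
 *	p2pr->device_pa = DRAM_PHYS_BASE;	// device PA backing the BAR window
 *
 *	rc = hl_p2p_region_init(hdev, p2pr);
 */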

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Wait for all the IO to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}
438