// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (technically it can work with an IOMMU, but then it is almost
 *    useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9.
 * 5. Requiring page alignment is OK (no solution to this one right now;
 *    how would we read partial pages?).
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    There is a rough idea of how this could be solved, but it is probably
 *    unacceptable for upstream and may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */

#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the data we need from the file handle provided by
 * the user.
 * @TODO: right now it is retrieved on each IO, but it could be fetched once
 * with a dedicated IOCTL, e.g. HL_REGISTER_HANDLE.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -ENOENT;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not open in direct (O_DIRECT) mode\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported, need to fall back to legacy IO\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;
	if (!bd) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}

	gd = bd->bd_disk;
	if (!gd) {
		dev_err(hdev->dev, "invalid gendisk\n");
		rc = -ENODEV;
		goto fput;
	}

	/* i_blocks is in 512-byte units; compare allocated bytes to the file size */
	if (((u64)inode->i_blocks << 9) < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}

static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/*
		 * Re-check after bumping the counter: hl_dio_stop() first
		 * clears io_enabled and then waits for the per-cpu counters
		 * to drain, so an IO that raced with the disable must back
		 * off here.
		 */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}

static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if (io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be 4K aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be 4K aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be 4K aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)\n",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}
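
/*
 * Worked example for the lookup above (illustrative numbers only): with a
 * region starting at device_pa = 0x10000000 and a 4 KiB PAGE_SIZE, a
 * translated device_pa of 0x10003000 lies 3 pages into the region and
 * therefore resolves to p2ppages[3].
 */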

static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = io->len_bytes >> PAGE_SHIFT;

	/* @TODO: this can be implemented smarter, vmalloc in iopath is not
	 * ideal. Maybe some variation of genpool. Number of pages may differ
	 * greatly, so maybe even use pools of different sizes and choose the
	 * closest one.
	 */
	io->bv = vzalloc(npages * sizeof(struct bio_vec));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va ; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx\n",
				device_va);
			rc = -EFAULT;
			goto cleanup;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

cleanup:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}

/*
 * @TODO: This function can be used as a callback for io completion under
 * kio->ki_complete in order to implement async IO.
 * Note that on more recent kernels there is no ret2.
 */
static __maybe_unused void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy result to user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}
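
/*
 * Hedged sketch (an assumption, not wired up in this file): to go async, the
 * submission path would install the completion callback and treat
 * -EIOCBQUEUED as "completion will be signalled later", e.g.:
 *
 *	io->kio.ki_complete = hl_direct_io_complete;
 *	rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
 *	if (rc != -EIOCBQUEUED)
 *		hl_direct_io_complete(&io->kio, rc, 0);
 *
 * On kernels >= v5.16 ki_complete takes only (kio, ret), so the callback
 * signature above would have to drop ret2 accordingly.
 */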

/*
 * DMA disk to ASIC, wait for results. Must be invoked from the user context
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}
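
/*
 * Hedged usage sketch (hypothetical caller, not part of this file): an IOCTL
 * handler that received the fd, device VA, offset and length from user space
 * could forward them as follows, where "args" is a hypothetical argument
 * struct:
 *
 *	size_t len_read = 0;
 *	int rc;
 *
 *	rc = hl_dio_ssd2hl(hdev, ctx, args->fd, args->device_va,
 *			   args->off_bytes, args->len_bytes, &len_read);
 *	if (!rc)
 *		args->len_read = len_read;
 */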

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}
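
/*
 * Hedged example (illustrative only, never called by the driver): set up a
 * single P2P region exposing @size bytes of BAR @bar at @bar_offset, backing
 * device physical address @device_pa. Field and member names follow the
 * structures as used elsewhere in this file; the helper name is hypothetical.
 */
static int __maybe_unused hl_p2p_single_region_example(struct hl_device *hdev, int bar,
							u64 bar_offset, u64 device_pa,
							u64 size)
{
	struct hl_p2p_region *p2pr;
	int rc;

	p2pr = kvcalloc(1, sizeof(*p2pr), GFP_KERNEL);
	if (!p2pr)
		return -ENOMEM;

	/* Describe the BAR window this region exposes */
	p2pr->bar = bar;
	p2pr->bar_offset = bar_offset;
	p2pr->device_pa = device_pa;
	p2pr->size = size;

	/* Publish the memory and build the page array */
	rc = hl_p2p_region_init(hdev, p2pr);
	if (rc) {
		kvfree(p2pr);
		return rc;
	}

	/* Make the region visible to hl_dio_va2page() lookups */
	hdev->hldio.p2prs = p2pr;
	hdev->hldio.np2prs = 1;

	return 0;
}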

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Wait for all the IO to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}
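
/*
 * Hedged lifecycle sketch (summary of how the pieces in this file fit
 * together; the exact call sites live elsewhere in the driver):
 *
 *	hl_dio_start(hdev);                                       at device init
 *	hl_p2p_region_init(hdev, p2pr);                           once per exposed region
 *	...
 *	hl_dio_ssd2hl(hdev, ctx, fd, va, off, len, &len_read);    per read
 *	...
 *	hl_dio_stop(hdev);                                        wait for inflight IOs
 *	hl_p2p_region_fini_all(hdev);                             release P2P memory
 */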