// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <generated/uapi/linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (technically it can work with an IOMMU, but it is almost useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9
 * 5. Requiring page alignment is OK (I don't see a solution to this one right
 *    now; how do we read partial pages?)
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    Theoretically I have a slight idea of how this could be solved, but it is
 *    probably unacceptable for upstream. Also, it may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */

#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the useful data we could milk out of the file
 * handle provided by the user.
 * @TODO: right now it is retrieved on each IO, but it can be done once with a
 * dedicated IOCTL, call it for example HL_REGISTER_HANDLE; see the sketch
 * after hl_dio_fd_unregister() below.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -ENOENT;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not opened with O_DIRECT\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported, need to fall back to legacy IO\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;

	/* i_blocks is in 512-byte units; fewer allocated bytes than i_size means holes */
	if ((inode->i_blocks << 9) < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!bd || !bd->bd_disk) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}
	gd = bd->bd_disk;

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}

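/*
 * Illustrative sketch only (not wired up): one way the @TODO above about a
 * dedicated HL_REGISTER_HANDLE ioctl could look. The ioctl name and the idea
 * of caching the registered fd in caller-owned storage are assumptions, not
 * part of the current uAPI. A per-IO path would then reuse the cached
 * hl_dio_fd instead of calling hl_dio_fd_register() on every IO.
 */
__maybe_unused static struct hl_dio_fd *hl_dio_fd_register_once(struct hl_ctx *ctx, int fd)
{
	struct hl_dio_fd *f;
	int rc;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return ERR_PTR(-ENOMEM);

	/* Same validation as the per-IO path, but performed a single time */
	rc = hl_dio_fd_register(ctx, fd, f);
	if (rc) {
		kfree(f);
		return ERR_PTR(rc);
	}

	return f;
}

__maybe_unused static void hl_dio_fd_unregister_once(struct hl_dio_fd *f)
{
	hl_dio_fd_unregister(f);
	kfree(f);
}
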
static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/*
		 * Re-check after bumping the counter to close the race with
		 * hl_dio_stop() clearing io_enabled and sampling a zero
		 * in-flight count before our increment became visible.
		 */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}

static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if (io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be page aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be page aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be page aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}

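/*
 * Illustrative helper (not used by the current code): translate a page-aligned
 * device VA range into an array of page pointers with repeated
 * hl_dio_va2page() lookups. hl_direct_io() below open-codes the same loop
 * while filling the bio_vec array; this form is shown only to make the
 * translation step easier to follow in isolation.
 */
__maybe_unused static int hl_dio_va_range_to_pages(struct hl_device *hdev, struct hl_ctx *ctx,
						   u64 device_va, u64 npages, struct page **pages)
{
	u64 i;

	for (i = 0 ; i < npages ; ++i, device_va += PAGE_SIZE) {
		pages[i] = hl_dio_va2page(hdev, ctx, device_va);
		if (!pages[i])
			return -EFAULT;
	}

	return 0;
}
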
static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = io->len_bytes >> PAGE_SHIFT;

	/*
	 * @TODO: this can be implemented smarter, vmalloc in the iopath is not
	 * ideal. Maybe some variation of genpool. The number of pages may
	 * differ greatly between IOs, so maybe even use pools of different
	 * sizes and choose the closest one; see the bio_vec pool sketch after
	 * hl_p2p_region_fini_all() below.
	 */
	io->bv = vzalloc(npages * sizeof(struct bio_vec));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va ; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx",
				device_va);
			rc = -EFAULT;
			goto cleanup;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

cleanup:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}

/*
 * @TODO: This function can be used as a callback for IO completion under
 * kio->ki_complete in order to implement async IO.
 * Note that on more recent kernels there is no ret2.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy result to user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}

/*
 * DMA disk to ASIC, wait for results. Must be invoked from the user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

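/*
 * Illustrative sketch (not wired up) for the @TODO in hl_direct_io() about
 * avoiding vzalloc() in the IO path: preallocate a mempool of bio_vec arrays
 * sized for the largest expected IO and fall back to vzalloc() only for
 * oversized requests. The pool depth, the maximum IO size and the function
 * names are assumptions made for the example; the include would normally sit
 * at the top of the file. This uses a mempool rather than the genpool
 * mentioned in the @TODO, which is a deliberate simplification.
 */
#include <linux/mempool.h>

#define HL_DIO_EXAMPLE_MAX_PAGES	1024	/* example: up to 4 MB per IO with 4K pages */
#define HL_DIO_EXAMPLE_POOL_MIN_NR	4	/* example: 4 preallocated arrays */

__maybe_unused static mempool_t *hl_dio_example_bv_pool_create(void)
{
	return mempool_create_kmalloc_pool(HL_DIO_EXAMPLE_POOL_MIN_NR,
					   HL_DIO_EXAMPLE_MAX_PAGES * sizeof(struct bio_vec));
}

__maybe_unused static struct bio_vec *hl_dio_example_bv_alloc(mempool_t *pool, u64 npages)
{
	struct bio_vec *bv;

	/* Rare oversized IOs still go through vmalloc */
	if (npages > HL_DIO_EXAMPLE_MAX_PAGES)
		return vzalloc(npages * sizeof(*bv));

	bv = mempool_alloc(pool, GFP_KERNEL);
	if (bv)
		memset(bv, 0, npages * sizeof(*bv));

	return bv;
}

__maybe_unused static void hl_dio_example_bv_free(mempool_t *pool, struct bio_vec *bv, u64 npages)
{
	if (npages > HL_DIO_EXAMPLE_MAX_PAGES)
		vfree(bv);
	else
		mempool_free(bv, pool);
}
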
int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Wait for all the IO to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}
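
/*
 * Illustrative usage sketch (not called anywhere): how an ASIC-specific init
 * path might describe one P2P region backed by a device BAR and bring the
 * direct IO path up. The BAR index, offset, size and device_pa below are
 * made-up example values; a real caller would take them from the ASIC
 * properties.
 */
__maybe_unused static int hl_dio_example_setup(struct hl_device *hdev)
{
	struct hl_p2p_region *p2pr;
	int rc;

	p2pr = kvcalloc(1, sizeof(*p2pr), GFP_KERNEL);
	if (!p2pr)
		return -ENOMEM;

	/* Example values only */
	p2pr->bar = 4;
	p2pr->bar_offset = 0;
	p2pr->size = 0x10000000ull;		/* 256 MB of exposed device memory */
	p2pr->device_pa = 0x1000000000ull;	/* device PA the BAR window maps to */

	rc = hl_p2p_region_init(hdev, p2pr);
	if (rc) {
		kvfree(p2pr);
		return rc;
	}

	hdev->hldio.p2prs = p2pr;
	hdev->hldio.np2prs = 1;

	/* Enable the IO path; hl_dio_stop() reverses this at teardown */
	rc = hl_dio_start(hdev);
	if (rc)
		hl_p2p_region_fini_all(hdev);

	return rc;
}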