1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */ 3 #include <linux/memremap.h> 4 #include <linux/pagemap.h> 5 #include <linux/module.h> 6 #include <linux/device.h> 7 #include <linux/cdev.h> 8 #include <linux/slab.h> 9 #include <linux/dax.h> 10 #include <linux/fs.h> 11 #include <linux/mm.h> 12 #include <linux/mman.h> 13 #include "dax-private.h" 14 #include "bus.h" 15 16 static int __check_vma(struct dev_dax *dev_dax, vma_flags_t flags, 17 unsigned long start, unsigned long end, struct file *file, 18 const char *func) 19 { 20 struct device *dev = &dev_dax->dev; 21 unsigned long mask; 22 23 if (!dax_alive(dev_dax->dax_dev)) 24 return -ENXIO; 25 26 /* prevent private mappings from being established */ 27 if (!vma_flags_test_any(&flags, VMA_MAYSHARE_BIT)) { 28 dev_info_ratelimited(dev, 29 "%s: %s: fail, attempted private mapping\n", 30 current->comm, func); 31 return -EINVAL; 32 } 33 34 mask = dev_dax->align - 1; 35 if (start & mask || end & mask) { 36 dev_info_ratelimited(dev, 37 "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", 38 current->comm, func, start, end, 39 mask); 40 return -EINVAL; 41 } 42 43 if (!file_is_dax(file)) { 44 dev_info_ratelimited(dev, 45 "%s: %s: fail, vma is not DAX capable\n", 46 current->comm, func); 47 return -EINVAL; 48 } 49 50 return 0; 51 } 52 53 static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, 54 const char *func) 55 { 56 return __check_vma(dev_dax, vma->flags, vma->vm_start, vma->vm_end, 57 vma->vm_file, func); 58 } 59 60 static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn, 61 unsigned long fault_size) 62 { 63 unsigned long i, nr_pages = fault_size / PAGE_SIZE; 64 struct file *filp = vmf->vma->vm_file; 65 struct dev_dax *dev_dax = filp->private_data; 66 pgoff_t pgoff; 67 68 /* mapping is only set on the head */ 69 if (dev_dax->pgmap->vmemmap_shift) 70 nr_pages = 1; 71 72 pgoff = linear_page_index(vmf->vma, 73 ALIGN_DOWN(vmf->address, fault_size)); 74 75 for (i = 0; i < nr_pages; i++) { 76 struct folio *folio = pfn_folio(pfn + i); 77 78 if (folio->mapping) 79 continue; 80 81 folio->mapping = filp->f_mapping; 82 folio->index = pgoff + i; 83 } 84 } 85 86 static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, 87 struct vm_fault *vmf) 88 { 89 struct device *dev = &dev_dax->dev; 90 phys_addr_t phys; 91 unsigned long pfn; 92 unsigned int fault_size = PAGE_SIZE; 93 94 if (check_vma(dev_dax, vmf->vma, __func__)) 95 return VM_FAULT_SIGBUS; 96 97 if (dev_dax->align > PAGE_SIZE) { 98 dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", 99 dev_dax->align, fault_size); 100 return VM_FAULT_SIGBUS; 101 } 102 103 if (fault_size != dev_dax->align) 104 return VM_FAULT_SIGBUS; 105 106 phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); 107 if (phys == -1) { 108 dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", vmf->pgoff); 109 return VM_FAULT_SIGBUS; 110 } 111 112 pfn = PHYS_PFN(phys); 113 114 dax_set_mapping(vmf, pfn, fault_size); 115 116 return vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), 117 vmf->flags & FAULT_FLAG_WRITE); 118 } 119 120 static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, 121 struct vm_fault *vmf) 122 { 123 unsigned long pmd_addr = vmf->address & PMD_MASK; 124 struct device *dev = &dev_dax->dev; 125 phys_addr_t phys; 126 pgoff_t pgoff; 127 unsigned long pfn; 128 unsigned int fault_size = PMD_SIZE; 129 130 if (check_vma(dev_dax, vmf->vma, __func__)) 131 return VM_FAULT_SIGBUS; 132 133 if (dev_dax->align > PMD_SIZE) { 134 dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", 135 dev_dax->align, fault_size); 136 return VM_FAULT_SIGBUS; 137 } 138 139 if (fault_size < dev_dax->align) 140 return VM_FAULT_SIGBUS; 141 else if (fault_size > dev_dax->align) 142 return VM_FAULT_FALLBACK; 143 144 /* if we are outside of the VMA */ 145 if (pmd_addr < vmf->vma->vm_start || 146 (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) 147 return VM_FAULT_SIGBUS; 148 149 pgoff = linear_page_index(vmf->vma, pmd_addr); 150 phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); 151 if (phys == -1) { 152 dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff); 153 return VM_FAULT_SIGBUS; 154 } 155 156 pfn = PHYS_PFN(phys); 157 158 dax_set_mapping(vmf, pfn, fault_size); 159 160 return vmf_insert_folio_pmd(vmf, page_folio(pfn_to_page(pfn)), 161 vmf->flags & FAULT_FLAG_WRITE); 162 } 163 164 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 165 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, 166 struct vm_fault *vmf) 167 { 168 unsigned long pud_addr = vmf->address & PUD_MASK; 169 struct device *dev = &dev_dax->dev; 170 phys_addr_t phys; 171 pgoff_t pgoff; 172 unsigned long pfn; 173 unsigned int fault_size = PUD_SIZE; 174 175 176 if (check_vma(dev_dax, vmf->vma, __func__)) 177 return VM_FAULT_SIGBUS; 178 179 if (dev_dax->align > PUD_SIZE) { 180 dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n", 181 dev_dax->align, fault_size); 182 return VM_FAULT_SIGBUS; 183 } 184 185 if (fault_size < dev_dax->align) 186 return VM_FAULT_SIGBUS; 187 else if (fault_size > dev_dax->align) 188 return VM_FAULT_FALLBACK; 189 190 /* if we are outside of the VMA */ 191 if (pud_addr < vmf->vma->vm_start || 192 (pud_addr + PUD_SIZE) > vmf->vma->vm_end) 193 return VM_FAULT_SIGBUS; 194 195 pgoff = linear_page_index(vmf->vma, pud_addr); 196 phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); 197 if (phys == -1) { 198 dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff); 199 return VM_FAULT_SIGBUS; 200 } 201 202 pfn = PHYS_PFN(phys); 203 204 dax_set_mapping(vmf, pfn, fault_size); 205 206 return vmf_insert_folio_pud(vmf, page_folio(pfn_to_page(pfn)), 207 vmf->flags & FAULT_FLAG_WRITE); 208 } 209 #else 210 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, 211 struct vm_fault *vmf) 212 { 213 return VM_FAULT_FALLBACK; 214 } 215 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 216 217 static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order) 218 { 219 struct file *filp = vmf->vma->vm_file; 220 vm_fault_t rc = VM_FAULT_SIGBUS; 221 int id; 222 struct dev_dax *dev_dax = filp->private_data; 223 224 dev_dbg(&dev_dax->dev, "%s: op=%s addr=%#lx order=%d\n", current->comm, 225 (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", 226 vmf->address & ~((1UL << (order + PAGE_SHIFT)) - 1), order); 227 228 id = dax_read_lock(); 229 if (order == 0) 230 rc = __dev_dax_pte_fault(dev_dax, vmf); 231 else if (order == PMD_ORDER) 232 rc = __dev_dax_pmd_fault(dev_dax, vmf); 233 else if (order == PUD_ORDER) 234 rc = __dev_dax_pud_fault(dev_dax, vmf); 235 else 236 rc = VM_FAULT_SIGBUS; 237 238 dax_read_unlock(id); 239 240 return rc; 241 } 242 243 static vm_fault_t dev_dax_fault(struct vm_fault *vmf) 244 { 245 return dev_dax_huge_fault(vmf, 0); 246 } 247 248 static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) 249 { 250 struct file *filp = vma->vm_file; 251 struct dev_dax *dev_dax = filp->private_data; 252 253 if (!IS_ALIGNED(addr, dev_dax->align)) 254 return -EINVAL; 255 return 0; 256 } 257 258 static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) 259 { 260 struct file *filp = vma->vm_file; 261 struct dev_dax *dev_dax = filp->private_data; 262 263 return dev_dax->align; 264 } 265 266 static const struct vm_operations_struct dax_vm_ops = { 267 .fault = dev_dax_fault, 268 .huge_fault = dev_dax_huge_fault, 269 .may_split = dev_dax_may_split, 270 .pagesize = dev_dax_pagesize, 271 }; 272 273 static int dax_mmap_prepare(struct vm_area_desc *desc) 274 { 275 struct file *filp = desc->file; 276 struct dev_dax *dev_dax = filp->private_data; 277 int rc, id; 278 279 dev_dbg(&dev_dax->dev, "trace\n"); 280 281 /* 282 * We lock to check dax_dev liveness and will re-check at 283 * fault time. 284 */ 285 id = dax_read_lock(); 286 rc = __check_vma(dev_dax, desc->vma_flags, desc->start, desc->end, filp, 287 __func__); 288 dax_read_unlock(id); 289 if (rc) 290 return rc; 291 292 desc->vm_ops = &dax_vm_ops; 293 vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT); 294 return 0; 295 } 296 297 /* return an unmapped area aligned to the dax region specified alignment */ 298 static unsigned long dax_get_unmapped_area(struct file *filp, 299 unsigned long addr, unsigned long len, unsigned long pgoff, 300 unsigned long flags) 301 { 302 unsigned long off, off_end, off_align, len_align, addr_align, align; 303 struct dev_dax *dev_dax = filp ? filp->private_data : NULL; 304 305 if (!dev_dax || addr) 306 goto out; 307 308 align = dev_dax->align; 309 off = pgoff << PAGE_SHIFT; 310 off_end = off + len; 311 off_align = round_up(off, align); 312 313 if ((off_end <= off_align) || ((off_end - off_align) < align)) 314 goto out; 315 316 len_align = len + align; 317 if ((off + len_align) < off) 318 goto out; 319 320 addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags); 321 if (!IS_ERR_VALUE(addr_align)) { 322 addr_align += (off - addr_align) & (align - 1); 323 return addr_align; 324 } 325 out: 326 return mm_get_unmapped_area(filp, addr, len, pgoff, flags); 327 } 328 329 static const struct address_space_operations dev_dax_aops = { 330 .dirty_folio = noop_dirty_folio, 331 }; 332 333 static int dax_open(struct inode *inode, struct file *filp) 334 { 335 struct dax_device *dax_dev = inode_dax(inode); 336 struct inode *__dax_inode = dax_inode(dax_dev); 337 struct dev_dax *dev_dax = dax_get_private(dax_dev); 338 339 dev_dbg(&dev_dax->dev, "trace\n"); 340 inode->i_mapping = __dax_inode->i_mapping; 341 inode->i_mapping->host = __dax_inode; 342 inode->i_mapping->a_ops = &dev_dax_aops; 343 filp->f_mapping = inode->i_mapping; 344 filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); 345 filp->f_sb_err = file_sample_sb_err(filp); 346 filp->private_data = dev_dax; 347 inode->i_flags = S_DAX; 348 349 return 0; 350 } 351 352 static int dax_release(struct inode *inode, struct file *filp) 353 { 354 struct dev_dax *dev_dax = filp->private_data; 355 356 dev_dbg(&dev_dax->dev, "trace\n"); 357 return 0; 358 } 359 360 static const struct file_operations dax_fops = { 361 .llseek = noop_llseek, 362 .owner = THIS_MODULE, 363 .open = dax_open, 364 .release = dax_release, 365 .get_unmapped_area = dax_get_unmapped_area, 366 .mmap_prepare = dax_mmap_prepare, 367 .fop_flags = FOP_MMAP_SYNC, 368 }; 369 370 static void dev_dax_cdev_del(void *cdev) 371 { 372 cdev_del(cdev); 373 } 374 375 static void dev_dax_kill(void *dev_dax) 376 { 377 kill_dev_dax(dev_dax); 378 } 379 380 static int dev_dax_probe(struct dev_dax *dev_dax) 381 { 382 struct dax_device *dax_dev = dev_dax->dax_dev; 383 struct device *dev = &dev_dax->dev; 384 struct dev_pagemap *pgmap; 385 struct inode *inode; 386 struct cdev *cdev; 387 void *addr; 388 int rc, i; 389 390 if (static_dev_dax(dev_dax)) { 391 if (dev_dax->nr_range > 1) { 392 dev_warn(dev, 393 "static pgmap / multi-range device conflict\n"); 394 return -EINVAL; 395 } 396 397 pgmap = dev_dax->pgmap; 398 } else { 399 if (dev_dax->pgmap) { 400 dev_warn(dev, 401 "dynamic-dax with pre-populated page map\n"); 402 return -EINVAL; 403 } 404 405 pgmap = devm_kzalloc(dev, 406 struct_size(pgmap, ranges, dev_dax->nr_range - 1), 407 GFP_KERNEL); 408 if (!pgmap) 409 return -ENOMEM; 410 411 pgmap->nr_range = dev_dax->nr_range; 412 dev_dax->pgmap = pgmap; 413 414 for (i = 0; i < dev_dax->nr_range; i++) { 415 struct range *range = &dev_dax->ranges[i].range; 416 pgmap->ranges[i] = *range; 417 } 418 } 419 420 for (i = 0; i < dev_dax->nr_range; i++) { 421 struct range *range = &dev_dax->ranges[i].range; 422 423 if (!devm_request_mem_region(dev, range->start, 424 range_len(range), dev_name(dev))) { 425 dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", 426 i, range->start, range->end); 427 return -EBUSY; 428 } 429 } 430 431 pgmap->type = MEMORY_DEVICE_GENERIC; 432 if (dev_dax->align > PAGE_SIZE) 433 pgmap->vmemmap_shift = 434 order_base_2(dev_dax->align >> PAGE_SHIFT); 435 addr = devm_memremap_pages(dev, pgmap); 436 if (IS_ERR(addr)) 437 return PTR_ERR(addr); 438 439 inode = dax_inode(dax_dev); 440 cdev = inode->i_cdev; 441 cdev_init(cdev, &dax_fops); 442 cdev->owner = dev->driver->owner; 443 cdev_set_parent(cdev, &dev->kobj); 444 rc = cdev_add(cdev, dev->devt, 1); 445 if (rc) 446 return rc; 447 448 rc = devm_add_action_or_reset(dev, dev_dax_cdev_del, cdev); 449 if (rc) 450 return rc; 451 452 run_dax(dax_dev); 453 return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax); 454 } 455 456 static struct dax_device_driver device_dax_driver = { 457 .probe = dev_dax_probe, 458 .type = DAXDRV_DEVICE_TYPE, 459 }; 460 461 static int __init dax_init(void) 462 { 463 return dax_driver_register(&device_dax_driver); 464 } 465 466 static void __exit dax_exit(void) 467 { 468 dax_driver_unregister(&device_dax_driver); 469 } 470 471 MODULE_AUTHOR("Intel Corporation"); 472 MODULE_DESCRIPTION("Device DAX: direct access device driver"); 473 MODULE_LICENSE("GPL v2"); 474 module_init(dax_init); 475 module_exit(dax_exit); 476 MODULE_ALIAS_DAX_DEVICE(0); 477