// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-2018 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include "dax-private.h"
#include "bus.h"

static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags,
                unsigned long start, unsigned long end, struct file *file,
                const char *func)
{
        struct device *dev = &dev_dax->dev;
        unsigned long mask;

        if (!dax_alive(dev_dax->dax_dev))
                return -ENXIO;

        /* prevent private mappings from being established */
        if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
                dev_info_ratelimited(dev,
                                "%s: %s: fail, attempted private mapping\n",
                                current->comm, func);
                return -EINVAL;
        }

        mask = dev_dax->align - 1;
        if (start & mask || end & mask) {
                dev_info_ratelimited(dev,
                                "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
                                current->comm, func, start, end,
                                mask);
                return -EINVAL;
        }

        if (!file_is_dax(file)) {
                dev_info_ratelimited(dev,
                                "%s: %s: fail, vma is not DAX capable\n",
                                current->comm, func);
                return -EINVAL;
        }

        return 0;
}

static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
                const char *func)
{
        return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end,
                           vma->vm_file, func);
}

/*
 * Translate a device-relative page offset to a physical address; returns
 * -1 if @pgoff is not backed by this device or if @size would extend past
 * the end of the containing range.
 *
 * See "strong" declaration in tools/testing/nvdimm/dax-dev.c.
 */
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
                unsigned long size)
{
        int i;

        for (i = 0; i < dev_dax->nr_range; i++) {
                struct dev_dax_range *dax_range = &dev_dax->ranges[i];
                struct range *range = &dax_range->range;
                unsigned long long pgoff_end;
                phys_addr_t phys;

                pgoff_end = dax_range->pgoff + PHYS_PFN(range_len(range)) - 1;
                if (pgoff < dax_range->pgoff || pgoff > pgoff_end)
                        continue;
                phys = PFN_PHYS(pgoff - dax_range->pgoff) + range->start;
                if (phys + size - 1 <= range->end)
                        return phys;
                break;
        }
        return -1;
}

static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
                            unsigned long fault_size)
{
        unsigned long i, nr_pages = fault_size / PAGE_SIZE;
        struct file *filp = vmf->vma->vm_file;
        struct dev_dax *dev_dax = filp->private_data;
        pgoff_t pgoff;

        /* mapping is only set on the head */
        if (dev_dax->pgmap->vmemmap_shift)
                nr_pages = 1;

        pgoff = linear_page_index(vmf->vma,
                        ALIGN_DOWN(vmf->address, fault_size));

        for (i = 0; i < nr_pages; i++) {
                struct folio *folio = pfn_folio(pfn + i);

                if (folio->mapping)
                        continue;

                folio->mapping = filp->f_mapping;
                folio->index = pgoff + i;
        }
}

static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
                                struct vm_fault *vmf)
{
        struct device *dev = &dev_dax->dev;
        phys_addr_t phys;
        unsigned long pfn;
        unsigned int fault_size = PAGE_SIZE;

        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;

        if (dev_dax->align > PAGE_SIZE) {
                dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
                        dev_dax->align, fault_size);
                return VM_FAULT_SIGBUS;
        }

        if (fault_size != dev_dax->align)
                return VM_FAULT_SIGBUS;

        phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
        if (phys == -1) {
                dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", vmf->pgoff);
                return VM_FAULT_SIGBUS;
        }

        pfn = PHYS_PFN(phys);

        dax_set_mapping(vmf, pfn, fault_size);

        return vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn),
                                       vmf->flags & FAULT_FLAG_WRITE);
}

/*
 * The PMD and PUD handlers below follow the same pattern as the PTE path:
 * validate the VMA, require the fault size to match the device alignment
 * (a fault larger than the alignment returns VM_FAULT_FALLBACK so the core
 * MM can retry at a smaller order), translate the page offset to a physical
 * address, associate the backing folios with the file mapping, and install
 * the mapping at the requested level.
 */
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
                                struct vm_fault *vmf)
{
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        struct device *dev = &dev_dax->dev;
        phys_addr_t phys;
        pgoff_t pgoff;
        unsigned long pfn;
        unsigned int fault_size = PMD_SIZE;

        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;

        if (dev_dax->align > PMD_SIZE) {
                dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
                        dev_dax->align, fault_size);
                return VM_FAULT_SIGBUS;
        }

        if (fault_size < dev_dax->align)
                return VM_FAULT_SIGBUS;
        else if (fault_size > dev_dax->align)
                return VM_FAULT_FALLBACK;

        /* if we are outside of the VMA */
        if (pmd_addr < vmf->vma->vm_start ||
                        (pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
                return VM_FAULT_SIGBUS;

        pgoff = linear_page_index(vmf->vma, pmd_addr);
        phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
        if (phys == -1) {
                dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff);
                return VM_FAULT_SIGBUS;
        }

        pfn = PHYS_PFN(phys);

        dax_set_mapping(vmf, pfn, fault_size);

        return vmf_insert_folio_pmd(vmf, page_folio(pfn_to_page(pfn)),
                                    vmf->flags & FAULT_FLAG_WRITE);
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
                                struct vm_fault *vmf)
{
        unsigned long pud_addr = vmf->address & PUD_MASK;
        struct device *dev = &dev_dax->dev;
        phys_addr_t phys;
        pgoff_t pgoff;
        unsigned long pfn;
        unsigned int fault_size = PUD_SIZE;

        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;

        if (dev_dax->align > PUD_SIZE) {
                dev_dbg(dev, "alignment (%#x) > fault size (%#x)\n",
                        dev_dax->align, fault_size);
                return VM_FAULT_SIGBUS;
        }

        if (fault_size < dev_dax->align)
                return VM_FAULT_SIGBUS;
        else if (fault_size > dev_dax->align)
                return VM_FAULT_FALLBACK;

        /* if we are outside of the VMA */
        if (pud_addr < vmf->vma->vm_start ||
                        (pud_addr + PUD_SIZE) > vmf->vma->vm_end)
                return VM_FAULT_SIGBUS;

        pgoff = linear_page_index(vmf->vma, pud_addr);
        phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
        if (phys == -1) {
                dev_dbg(dev, "pgoff_to_phys(%#lx) failed\n", pgoff);
                return VM_FAULT_SIGBUS;
        }

        pfn = PHYS_PFN(phys);

        dax_set_mapping(vmf, pfn, fault_size);

        return vmf_insert_folio_pud(vmf, page_folio(pfn_to_page(pfn)),
                                    vmf->flags & FAULT_FLAG_WRITE);
}
#else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
                                struct vm_fault *vmf)
{
        return VM_FAULT_FALLBACK;
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Entry point for all fault orders: hold dax_read_lock() so dax_dev
 * liveness (re-checked via check_vma() in the handlers) is stable for the
 * duration of the fault, then dispatch to the PTE, PMD, or PUD handler.
 */
static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
        struct file *filp = vmf->vma->vm_file;
        vm_fault_t rc = VM_FAULT_SIGBUS;
        int id;
        struct dev_dax *dev_dax = filp->private_data;

        dev_dbg(&dev_dax->dev, "%s: op=%s addr=%#lx order=%d\n", current->comm,
                (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read",
                vmf->address & ~((1UL << (order + PAGE_SHIFT)) - 1), order);

        id = dax_read_lock();
        if (order == 0)
                rc = __dev_dax_pte_fault(dev_dax, vmf);
        else if (order == PMD_ORDER)
                rc = __dev_dax_pmd_fault(dev_dax, vmf);
        else if (order == PUD_ORDER)
                rc = __dev_dax_pud_fault(dev_dax, vmf);
        else
                rc = VM_FAULT_SIGBUS;

        dax_read_unlock(id);

        return rc;
}

static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
{
        return dev_dax_huge_fault(vmf, 0);
}

static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr)
{
        struct file *filp = vma->vm_file;
        struct dev_dax *dev_dax = filp->private_data;

        if (!IS_ALIGNED(addr, dev_dax->align))
                return -EINVAL;
        return 0;
}

static unsigned long dev_dax_pagesize(struct vm_area_struct *vma)
{
        struct file *filp = vma->vm_file;
        struct dev_dax *dev_dax = filp->private_data;

        return dev_dax->align;
}

static const struct vm_operations_struct dax_vm_ops = {
        .fault = dev_dax_fault,
        .huge_fault = dev_dax_huge_fault,
        .may_split = dev_dax_may_split,
        .pagesize = dev_dax_pagesize,
};

static int dax_mmap_prepare(struct vm_area_desc *desc)
{
        struct file *filp = desc->file;
        struct dev_dax *dev_dax = filp->private_data;
        int rc, id;

        dev_dbg(&dev_dax->dev, "trace\n");

        /*
         * We lock to check dax_dev liveness and will re-check at
         * fault time.
         */
        id = dax_read_lock();
        rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp,
                         __func__);
        dax_read_unlock(id);
        if (rc)
                return rc;

        desc->vm_ops = &dax_vm_ops;
        desc->vm_flags |= VM_HUGEPAGE;
        return 0;
}
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
                unsigned long addr, unsigned long len, unsigned long pgoff,
                unsigned long flags)
{
        unsigned long off, off_end, off_align, len_align, addr_align, align;
        struct dev_dax *dev_dax = filp ? filp->private_data : NULL;

        if (!dev_dax || addr)
                goto out;

        align = dev_dax->align;
        off = pgoff << PAGE_SHIFT;
        off_end = off + len;
        off_align = round_up(off, align);

        if ((off_end <= off_align) || ((off_end - off_align) < align))
                goto out;

        len_align = len + align;
        if ((off + len_align) < off)
                goto out;

        addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags);
        if (!IS_ERR_VALUE(addr_align)) {
                addr_align += (off - addr_align) & (align - 1);
                return addr_align;
        }
 out:
        return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
}

static const struct address_space_operations dev_dax_aops = {
        .dirty_folio = noop_dirty_folio,
};

static int dax_open(struct inode *inode, struct file *filp)
{
        struct dax_device *dax_dev = inode_dax(inode);
        struct inode *__dax_inode = dax_inode(dax_dev);
        struct dev_dax *dev_dax = dax_get_private(dax_dev);

        dev_dbg(&dev_dax->dev, "trace\n");
        inode->i_mapping = __dax_inode->i_mapping;
        inode->i_mapping->host = __dax_inode;
        inode->i_mapping->a_ops = &dev_dax_aops;
        filp->f_mapping = inode->i_mapping;
        filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
        filp->f_sb_err = file_sample_sb_err(filp);
        filp->private_data = dev_dax;
        inode->i_flags = S_DAX;

        return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
        struct dev_dax *dev_dax = filp->private_data;

        dev_dbg(&dev_dax->dev, "trace\n");
        return 0;
}

static const struct file_operations dax_fops = {
        .llseek = noop_llseek,
        .owner = THIS_MODULE,
        .open = dax_open,
        .release = dax_release,
        .get_unmapped_area = dax_get_unmapped_area,
        .mmap_prepare = dax_mmap_prepare,
        .fop_flags = FOP_MMAP_SYNC,
};

static void dev_dax_cdev_del(void *cdev)
{
        cdev_del(cdev);
}

static void dev_dax_kill(void *dev_dax)
{
        kill_dev_dax(dev_dax);
}
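/*
 * Driver probe: validate or build the dev_pagemap covering the device
 * ranges, reserve and memremap them so struct pages back the device memory,
 * then publish the character device.  Teardown is devm-managed and runs in
 * reverse order, so dev_dax_kill() (registered last) runs before the cdev
 * and the page map are torn down.
 */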
static int dev_dax_probe(struct dev_dax *dev_dax)
{
        struct dax_device *dax_dev = dev_dax->dax_dev;
        struct device *dev = &dev_dax->dev;
        struct dev_pagemap *pgmap;
        struct inode *inode;
        struct cdev *cdev;
        void *addr;
        int rc, i;

        if (static_dev_dax(dev_dax)) {
                if (dev_dax->nr_range > 1) {
                        dev_warn(dev,
                                "static pgmap / multi-range device conflict\n");
                        return -EINVAL;
                }

                pgmap = dev_dax->pgmap;
        } else {
                if (dev_dax->pgmap) {
                        dev_warn(dev,
                                "dynamic-dax with pre-populated page map\n");
                        return -EINVAL;
                }

                pgmap = devm_kzalloc(dev,
                        struct_size(pgmap, ranges, dev_dax->nr_range - 1),
                        GFP_KERNEL);
                if (!pgmap)
                        return -ENOMEM;

                pgmap->nr_range = dev_dax->nr_range;
                dev_dax->pgmap = pgmap;

                for (i = 0; i < dev_dax->nr_range; i++) {
                        struct range *range = &dev_dax->ranges[i].range;
                        pgmap->ranges[i] = *range;
                }
        }

        for (i = 0; i < dev_dax->nr_range; i++) {
                struct range *range = &dev_dax->ranges[i].range;

                if (!devm_request_mem_region(dev, range->start,
                                        range_len(range), dev_name(dev))) {
                        dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
                                        i, range->start, range->end);
                        return -EBUSY;
                }
        }

        pgmap->type = MEMORY_DEVICE_GENERIC;
        if (dev_dax->align > PAGE_SIZE)
                pgmap->vmemmap_shift =
                        order_base_2(dev_dax->align >> PAGE_SHIFT);
        addr = devm_memremap_pages(dev, pgmap);
        if (IS_ERR(addr))
                return PTR_ERR(addr);

        inode = dax_inode(dax_dev);
        cdev = inode->i_cdev;
        cdev_init(cdev, &dax_fops);
        cdev->owner = dev->driver->owner;
        cdev_set_parent(cdev, &dev->kobj);
        rc = cdev_add(cdev, dev->devt, 1);
        if (rc)
                return rc;

        rc = devm_add_action_or_reset(dev, dev_dax_cdev_del, cdev);
        if (rc)
                return rc;

        run_dax(dax_dev);
        return devm_add_action_or_reset(dev, dev_dax_kill, dev_dax);
}

static struct dax_device_driver device_dax_driver = {
        .probe = dev_dax_probe,
        .type = DAXDRV_DEVICE_TYPE,
};

static int __init dax_init(void)
{
        return dax_driver_register(&device_dax_driver);
}

static void __exit dax_exit(void)
{
        dax_driver_unregister(&device_dax_driver);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("Device DAX: direct access device driver");
MODULE_LICENSE("GPL v2");
module_init(dax_init);
module_exit(dax_exit);
MODULE_ALIAS_DAX_DEVICE(0);