/*
 * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax-private.h"
#include "dax.h"

static struct class *dax_class;

/*
 * Rely on the fact that drvdata is set before the attributes are
 * registered, and that the attributes are unregistered before drvdata
 * is cleared to assume that drvdata is always valid.
 */
static ssize_t id_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%d\n", dax_region->id);
}
static DEVICE_ATTR_RO(id);

static ssize_t region_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%llu\n", (unsigned long long)
			resource_size(&dax_region->res));
}
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
		region_size_show, NULL);

static ssize_t align_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%u\n", dax_region->align);
}
static DEVICE_ATTR_RO(align);

static struct attribute *dax_region_attributes[] = {
	&dev_attr_region_size.attr,
	&dev_attr_align.attr,
	&dev_attr_id.attr,
	NULL,
};

static const struct attribute_group dax_region_attribute_group = {
	.name = "dax_region",
	.attrs = dax_region_attributes,
};

static const struct attribute_group *dax_region_attribute_groups[] = {
	&dax_region_attribute_group,
	NULL,
};

static void dax_region_free(struct kref *kref)
{
	struct dax_region *dax_region;

	dax_region = container_of(kref, struct dax_region, kref);
	kfree(dax_region);
}

void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);

static void dax_region_unregister(void *region)
{
	struct dax_region *dax_region = region;

	sysfs_remove_groups(&dax_region->dev->kobj,
			dax_region_attribute_groups);
	dax_region_put(dax_region);
}
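/**
 * alloc_dax_region - allocate and register a dax_region for a memory range
 * @parent: device that provides the memory range (e.g. a dax_pmem device)
 * @region_id: id that seeds the "dax<region>.<n>" child device names
 * @res: physical address range backing the region
 * @region_align: required alignment for the range and for all mappings of it
 * @addr: base address of the mapped range, stored in dax_region->base
 * @pfn_flags: PFN_DEV / PFN_MAP capabilities of the backing pages
 *
 * The region is reference counted; its sysfs attribute group is removed
 * and the devm-held reference dropped automatically when @parent is
 * unbound.  Returns NULL on failure.
 */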
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;

	/*
	 * The DAX core assumes that it can store its private data in
	 * parent->driver_data. This WARN is a reminder / safeguard for
	 * developers of device-dax drivers.
	 */
	if (dev_get_drvdata(parent)) {
		dev_WARN(parent, "dax core failed to setup private data\n");
		return NULL;
	}

	if (!IS_ALIGNED(res->start, align)
			|| !IS_ALIGNED(resource_size(res), align))
		return NULL;

	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
	if (!dax_region)
		return NULL;

	dev_set_drvdata(parent, dax_region);
	memcpy(&dax_region->res, res, sizeof(*res));
	dax_region->pfn_flags = pfn_flags;
	kref_init(&dax_region->kref);
	dax_region->id = region_id;
	ida_init(&dax_region->ida);
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->base = addr;
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
		kfree(dax_region);
		return NULL;
	}

	kref_get(&dax_region->kref);
	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
		return NULL;
	return dax_region;
}
EXPORT_SYMBOL_GPL(alloc_dax_region);

static struct dev_dax *to_dev_dax(struct device *dev)
{
	return container_of(dev, struct dev_dax, dev);
}

static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	unsigned long long size = 0;
	int i;

	for (i = 0; i < dev_dax->num_resources; i++)
		size += resource_size(&dev_dax->res[i]);

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);

static struct attribute *dev_dax_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dev_dax_attribute_group = {
	.attrs = dev_dax_attributes,
};

static const struct attribute_group *dax_attribute_groups[] = {
	&dev_dax_attribute_group,
	NULL,
};
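/*
 * check_vma() enforces the constraints that every mapping of a device-dax
 * range must satisfy: the backing dax_device must still be alive, the
 * mapping must be shared (MAP_SHARED), the vma start and end must honor
 * the region alignment, the vma must be DAX capable, and ranges without
 * struct pages (PFN_DEV without PFN_MAP) additionally require VM_DONTCOPY
 * (madvise(MADV_DONTFORK)).  For example, an
 * mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) of the dax
 * character device with @len a multiple of the region alignment passes
 * these checks.
 */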
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dev_dax->region;
	struct device *dev = &dev_dax->dev;
	unsigned long mask;

	if (!dax_alive(dev_dax->dax_dev))
		return -ENXIO;

	/* prevent private mappings from being established */
	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}

/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	/* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
	phys_addr_t uninitialized_var(phys);
	int i;

	for (i = 0; i < dev_dax->num_resources; i++) {
		res = &dev_dax->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dev_dax->num_resources) {
		res = &dev_dax->res[i];
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}

static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;
	unsigned int fault_size = PAGE_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size != dax_region->align)
		return VM_FAULT_SIGBUS;

	phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}
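/*
 * The PMD and PUD handlers below mirror the PTE path at larger
 * granularities: validate the vma, require the fault size to match the
 * region alignment (smaller faults SIGBUS, larger ones fall back), check
 * that the huge region fits inside the vma, translate the page offset to
 * a physical address, and insert a huge device-dax mapping.  They
 * additionally require the region to be PFN_MAP (devmap) capable.
 */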
static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PMD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pmd_addr < vmf->vma->vm_start ||
			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pmd_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	unsigned long pud_addr = vmf->address & PUD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PUD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PUD_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pud mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pud_addr < vmf->vma->vm_start ||
			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pud_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}
#else
static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	return VM_FAULT_FALLBACK;
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static int dev_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int rc, id;
	struct file *filp = vmf->vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			? "write" : "read",
			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);

	id = dax_read_lock();
	switch (pe_size) {
	case PE_SIZE_PTE:
		rc = __dev_dax_pte_fault(dev_dax, vmf);
		break;
	case PE_SIZE_PMD:
		rc = __dev_dax_pmd_fault(dev_dax, vmf);
		break;
	case PE_SIZE_PUD:
		rc = __dev_dax_pud_fault(dev_dax, vmf);
		break;
	default:
		rc = VM_FAULT_SIGBUS;
	}
	dax_read_unlock(id);

	return rc;
}

static int dev_dax_fault(struct vm_fault *vmf)
{
	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct dax_vm_ops = {
	.fault = dev_dax_fault,
	.huge_fault = dev_dax_huge_fault,
};

static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dev_dax *dev_dax = filp->private_data;
	int rc, id;

	dev_dbg(&dev_dax->dev, "%s\n", __func__);

	/*
	 * We lock to check dax_dev liveness and will re-check at
	 * fault time.
	 */
	id = dax_read_lock();
	rc = check_vma(dev_dax, vma, __func__);
	dax_read_unlock(id);
	if (rc)
		return rc;

	vma->vm_ops = &dax_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
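/*
 * mmap address selection: to let faults use PMD/PUD mappings, the search
 * below asks the architecture's get_unmapped_area() for a window one
 * alignment unit larger than requested and then shifts the returned
 * address so that the virtual address and the file offset are congruent
 * modulo the region alignment.
 */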
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dev_dax || addr)
		goto out;

	dax_region = dev_dax->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}

static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	struct inode *__dax_inode = dax_inode(dax_dev);
	struct dev_dax *dev_dax = dax_get_private(dax_dev);

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	inode->i_mapping = __dax_inode->i_mapping;
	inode->i_mapping->host = __dax_inode;
	filp->f_mapping = inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	filp->private_data = dev_dax;
	inode->i_flags = S_DAX;

	return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	return 0;
}

static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
};

static void dev_dax_release(struct device *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_region *dax_region = dev_dax->region;
	struct dax_device *dax_dev = dev_dax->dax_dev;

	if (dev_dax->id >= 0)
		ida_simple_remove(&dax_region->ida, dev_dax->id);
	dax_region_put(dax_region);
	put_dax(dax_dev);
	kfree(dev_dax);
}

static void kill_dev_dax(struct dev_dax *dev_dax)
{
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct inode *inode = dax_inode(dax_dev);

	kill_dax(dax_dev);
	unmap_mapping_range(inode->i_mapping, 0, 0, 1);
}

static void unregister_dev_dax(void *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct inode *inode = dax_inode(dax_dev);
	struct cdev *cdev = inode->i_cdev;

	dev_dbg(dev, "%s\n", __func__);

	kill_dev_dax(dev_dax);
	cdev_device_del(cdev, dev);
	put_device(dev);
}
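/**
 * devm_create_dev_dax - create a character device for a dax_region
 * @dax_region: parent region
 * @id: device id, or a negative value to allocate one from the region ida
 * @res: array of physical address ranges backing the device
 * @count: number of entries in @res
 *
 * Each range in @res must honor the region alignment.  On success the
 * dax<region>.<id> character device is live; it is unregistered
 * automatically, via devm, when the region's parent device is unbound.
 *
 * Return: the new dev_dax on success, an ERR_PTR() on failure.
 */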
struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
		int id, struct resource *res, int count)
{
	struct device *parent = dax_region->dev;
	struct dax_device *dax_dev;
	struct dev_dax *dev_dax;
	struct inode *inode;
	struct device *dev;
	struct cdev *cdev;
	int rc, i;

	if (!count)
		return ERR_PTR(-EINVAL);

	dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
	if (!dev_dax)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < count; i++) {
		if (!IS_ALIGNED(res[i].start, dax_region->align)
				|| !IS_ALIGNED(resource_size(&res[i]),
					dax_region->align)) {
			rc = -EINVAL;
			break;
		}
		dev_dax->res[i].start = res[i].start;
		dev_dax->res[i].end = res[i].end;
	}

	if (i < count)
		goto err_id;

	if (id < 0) {
		id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
		dev_dax->id = id;
		if (id < 0) {
			rc = id;
			goto err_id;
		}
	} else {
		/* region provider owns @id lifetime */
		dev_dax->id = -1;
	}

	/*
	 * No 'host' or dax_operations since there is no access to this
	 * device outside of mmap of the resulting character device.
	 */
	dax_dev = alloc_dax(dev_dax, NULL, NULL);
	if (!dax_dev) {
		rc = -ENOMEM;
		goto err_dax;
	}

	/* from here on we're committed to teardown via dev_dax_release() */
	dev = &dev_dax->dev;
	device_initialize(dev);

	inode = dax_inode(dax_dev);
	cdev = inode->i_cdev;
	cdev_init(cdev, &dax_fops);
	cdev->owner = parent->driver->owner;

	dev_dax->num_resources = count;
	dev_dax->dax_dev = dax_dev;
	dev_dax->region = dax_region;
	kref_get(&dax_region->kref);

	dev->devt = inode->i_rdev;
	dev->class = dax_class;
	dev->parent = parent;
	dev->groups = dax_attribute_groups;
	dev->release = dev_dax_release;
	dev_set_name(dev, "dax%d.%d", dax_region->id, id);

	rc = cdev_device_add(cdev, dev);
	if (rc) {
		kill_dev_dax(dev_dax);
		put_device(dev);
		return ERR_PTR(rc);
	}

	rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
	if (rc)
		return ERR_PTR(rc);

	return dev_dax;

 err_dax:
	if (dev_dax->id >= 0)
		ida_simple_remove(&dax_region->ida, dev_dax->id);
 err_id:
	kfree(dev_dax);

	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(devm_create_dev_dax);

static int __init dax_init(void)
{
	dax_class = class_create(THIS_MODULE, "dax");
	return PTR_ERR_OR_ZERO(dax_class);
}

static void __exit dax_exit(void)
{
	class_destroy(dax_class);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_init);
module_exit(dax_exit);