1d5406bd4SJohn Groves // SPDX-License-Identifier: GPL-2.0 2d5406bd4SJohn Groves /* Copyright(c) 2026 Micron Technology, Inc. */ 3d5406bd4SJohn Groves #include <linux/memremap.h> 4d5406bd4SJohn Groves #include <linux/pagemap.h> 5d5406bd4SJohn Groves #include <linux/module.h> 6d5406bd4SJohn Groves #include <linux/device.h> 7d5406bd4SJohn Groves #include <linux/cdev.h> 8d5406bd4SJohn Groves #include <linux/slab.h> 9d5406bd4SJohn Groves #include <linux/dax.h> 10d5406bd4SJohn Groves #include <linux/uio.h> 11d5406bd4SJohn Groves #include <linux/fs.h> 12d5406bd4SJohn Groves #include <linux/mm.h> 13d5406bd4SJohn Groves #include "dax-private.h" 14d5406bd4SJohn Groves #include "bus.h" 15d5406bd4SJohn Groves 16d5406bd4SJohn Groves /* 17d5406bd4SJohn Groves * FS-DAX compatible devdax driver 18d5406bd4SJohn Groves * 19d5406bd4SJohn Groves * Unlike drivers/dax/device.c which pre-initializes compound folios based 20d5406bd4SJohn Groves * on device alignment (via vmemmap_shift), this driver leaves folios 21d5406bd4SJohn Groves * uninitialized similar to pmem. This allows fs-dax filesystems like famfs 22d5406bd4SJohn Groves * to work without needing special handling for pre-initialized folios. 23d5406bd4SJohn Groves * 24d5406bd4SJohn Groves * Key differences from device.c: 25d5406bd4SJohn Groves * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) 26d5406bd4SJohn Groves * - vmemmap_shift is NOT set (folios remain order-0) 27d5406bd4SJohn Groves * - fs-dax can dynamically create compound folios as needed 28d5406bd4SJohn Groves * - No mmap support - all access is through fs-dax/iomap 29d5406bd4SJohn Groves */ 30d5406bd4SJohn Groves 31d5406bd4SJohn Groves static void fsdev_cdev_del(void *cdev) 32d5406bd4SJohn Groves { 33d5406bd4SJohn Groves cdev_del(cdev); 34d5406bd4SJohn Groves } 35d5406bd4SJohn Groves 36d5406bd4SJohn Groves static void fsdev_kill(void *dev_dax) 37d5406bd4SJohn Groves { 38d5406bd4SJohn Groves kill_dev_dax(dev_dax); 39d5406bd4SJohn Groves } 40d5406bd4SJohn Groves 41d5406bd4SJohn Groves /* 42d5406bd4SJohn Groves * Page map operations for FS-DAX mode 43d5406bd4SJohn Groves * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c 44d5406bd4SJohn Groves * 45d5406bd4SJohn Groves * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. 46d5406bd4SJohn Groves * The core mm code in free_zone_device_folio() handles the wake_up_var() 47d5406bd4SJohn Groves * directly for this memory type. 48d5406bd4SJohn Groves */ 49d5406bd4SJohn Groves static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, 50d5406bd4SJohn Groves unsigned long pfn, unsigned long nr_pages, int mf_flags) 51d5406bd4SJohn Groves { 52d5406bd4SJohn Groves struct dev_dax *dev_dax = pgmap->owner; 53d5406bd4SJohn Groves u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; 54d5406bd4SJohn Groves u64 len = nr_pages << PAGE_SHIFT; 55d5406bd4SJohn Groves 56d5406bd4SJohn Groves return dax_holder_notify_failure(dev_dax->dax_dev, offset, 57d5406bd4SJohn Groves len, mf_flags); 58d5406bd4SJohn Groves } 59d5406bd4SJohn Groves 60d5406bd4SJohn Groves static const struct dev_pagemap_ops fsdev_pagemap_ops = { 61d5406bd4SJohn Groves .memory_failure = fsdev_pagemap_memory_failure, 62d5406bd4SJohn Groves }; 63d5406bd4SJohn Groves 64d5406bd4SJohn Groves /* 65d5406bd4SJohn Groves * Clear any stale folio state from pages in the given range. 66d5406bd4SJohn Groves * This is necessary because device_dax pre-initializes compound folios 67d5406bd4SJohn Groves * based on vmemmap_shift, and that state may persist after driver unbind. 68d5406bd4SJohn Groves * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax 69d5406bd4SJohn Groves * expects to find clean order-0 folios that it can build into compound 70d5406bd4SJohn Groves * folios on demand. 71d5406bd4SJohn Groves * 72d5406bd4SJohn Groves * At probe time, no filesystem should be mounted yet, so all mappings 73d5406bd4SJohn Groves * are stale and must be cleared along with compound state. 74d5406bd4SJohn Groves */ 75d5406bd4SJohn Groves static void fsdev_clear_folio_state(struct dev_dax *dev_dax) 76d5406bd4SJohn Groves { 77d5406bd4SJohn Groves for (int i = 0; i < dev_dax->nr_range; i++) { 78d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 79d5406bd4SJohn Groves unsigned long pfn = PHYS_PFN(range->start); 80d5406bd4SJohn Groves unsigned long end_pfn = PHYS_PFN(range->end) + 1; 81d5406bd4SJohn Groves 82d5406bd4SJohn Groves while (pfn < end_pfn) { 83d5406bd4SJohn Groves struct folio *folio = pfn_folio(pfn); 84d5406bd4SJohn Groves int order = dax_folio_reset_order(folio); 85d5406bd4SJohn Groves 86d5406bd4SJohn Groves pfn += 1UL << order; 87d5406bd4SJohn Groves } 88d5406bd4SJohn Groves } 89d5406bd4SJohn Groves } 90d5406bd4SJohn Groves 91d5406bd4SJohn Groves static void fsdev_clear_folio_state_action(void *data) 92d5406bd4SJohn Groves { 93d5406bd4SJohn Groves fsdev_clear_folio_state(data); 94d5406bd4SJohn Groves } 95d5406bd4SJohn Groves 96d5406bd4SJohn Groves static int fsdev_open(struct inode *inode, struct file *filp) 97d5406bd4SJohn Groves { 98d5406bd4SJohn Groves struct dax_device *dax_dev = inode_dax(inode); 99d5406bd4SJohn Groves struct dev_dax *dev_dax = dax_get_private(dax_dev); 100d5406bd4SJohn Groves 101d5406bd4SJohn Groves filp->private_data = dev_dax; 102d5406bd4SJohn Groves 103d5406bd4SJohn Groves return 0; 104d5406bd4SJohn Groves } 105d5406bd4SJohn Groves 106d5406bd4SJohn Groves static int fsdev_release(struct inode *inode, struct file *filp) 107d5406bd4SJohn Groves { 108d5406bd4SJohn Groves return 0; 109d5406bd4SJohn Groves } 110d5406bd4SJohn Groves 111d5406bd4SJohn Groves static const struct file_operations fsdev_fops = { 112d5406bd4SJohn Groves .llseek = noop_llseek, 113d5406bd4SJohn Groves .owner = THIS_MODULE, 114d5406bd4SJohn Groves .open = fsdev_open, 115d5406bd4SJohn Groves .release = fsdev_release, 116d5406bd4SJohn Groves }; 117d5406bd4SJohn Groves 118d5406bd4SJohn Groves static int fsdev_dax_probe(struct dev_dax *dev_dax) 119d5406bd4SJohn Groves { 120d5406bd4SJohn Groves struct dax_device *dax_dev = dev_dax->dax_dev; 121d5406bd4SJohn Groves struct device *dev = &dev_dax->dev; 122d5406bd4SJohn Groves struct dev_pagemap *pgmap; 123d5406bd4SJohn Groves struct inode *inode; 124*75945584SJohn Groves u64 data_offset = 0; 125d5406bd4SJohn Groves struct cdev *cdev; 126d5406bd4SJohn Groves void *addr; 127d5406bd4SJohn Groves int rc, i; 128d5406bd4SJohn Groves 129d5406bd4SJohn Groves if (static_dev_dax(dev_dax)) { 130d5406bd4SJohn Groves if (dev_dax->nr_range > 1) { 131d5406bd4SJohn Groves dev_warn(dev, "static pgmap / multi-range device conflict\n"); 132d5406bd4SJohn Groves return -EINVAL; 133d5406bd4SJohn Groves } 134d5406bd4SJohn Groves 135d5406bd4SJohn Groves pgmap = dev_dax->pgmap; 136d5406bd4SJohn Groves } else { 137d5406bd4SJohn Groves size_t pgmap_size; 138d5406bd4SJohn Groves 139d5406bd4SJohn Groves if (dev_dax->pgmap) { 140d5406bd4SJohn Groves dev_warn(dev, "dynamic-dax with pre-populated page map\n"); 141d5406bd4SJohn Groves return -EINVAL; 142d5406bd4SJohn Groves } 143d5406bd4SJohn Groves 144d5406bd4SJohn Groves pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); 145d5406bd4SJohn Groves pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); 146d5406bd4SJohn Groves if (!pgmap) 147d5406bd4SJohn Groves return -ENOMEM; 148d5406bd4SJohn Groves 149d5406bd4SJohn Groves pgmap->nr_range = dev_dax->nr_range; 150d5406bd4SJohn Groves dev_dax->pgmap = pgmap; 151d5406bd4SJohn Groves 152d5406bd4SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) { 153d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 154d5406bd4SJohn Groves 155d5406bd4SJohn Groves pgmap->ranges[i] = *range; 156d5406bd4SJohn Groves } 157d5406bd4SJohn Groves } 158d5406bd4SJohn Groves 159d5406bd4SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) { 160d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 161d5406bd4SJohn Groves 162d5406bd4SJohn Groves if (!devm_request_mem_region(dev, range->start, 163d5406bd4SJohn Groves range_len(range), dev_name(dev))) { 164d5406bd4SJohn Groves dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", 165d5406bd4SJohn Groves i, range->start, range->end); 166d5406bd4SJohn Groves return -EBUSY; 167d5406bd4SJohn Groves } 168d5406bd4SJohn Groves } 169d5406bd4SJohn Groves 170d5406bd4SJohn Groves /* 171d5406bd4SJohn Groves * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving 172d5406bd4SJohn Groves * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this 173d5406bd4SJohn Groves * lets fs-dax dynamically build compound folios as needed, similar 174d5406bd4SJohn Groves * to pmem behavior. 175d5406bd4SJohn Groves */ 176d5406bd4SJohn Groves pgmap->type = MEMORY_DEVICE_FS_DAX; 177d5406bd4SJohn Groves pgmap->ops = &fsdev_pagemap_ops; 178d5406bd4SJohn Groves pgmap->owner = dev_dax; 179d5406bd4SJohn Groves 180d5406bd4SJohn Groves addr = devm_memremap_pages(dev, pgmap); 181d5406bd4SJohn Groves if (IS_ERR(addr)) 182d5406bd4SJohn Groves return PTR_ERR(addr); 183d5406bd4SJohn Groves 184d5406bd4SJohn Groves /* 185d5406bd4SJohn Groves * Clear any stale compound folio state left over from a previous 186d5406bd4SJohn Groves * driver (e.g., device_dax with vmemmap_shift). Also register this 187d5406bd4SJohn Groves * as a devm action so folio state is cleared on unbind, ensuring 188d5406bd4SJohn Groves * clean pages for subsequent drivers (e.g., kmem for system-ram). 189d5406bd4SJohn Groves */ 190d5406bd4SJohn Groves fsdev_clear_folio_state(dev_dax); 191d5406bd4SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, 192d5406bd4SJohn Groves dev_dax); 193d5406bd4SJohn Groves if (rc) 194d5406bd4SJohn Groves return rc; 195d5406bd4SJohn Groves 196d5406bd4SJohn Groves /* Detect whether the data is at a non-zero offset into the memory */ 197d5406bd4SJohn Groves if (pgmap->range.start != dev_dax->ranges[0].range.start) { 198d5406bd4SJohn Groves u64 phys = dev_dax->ranges[0].range.start; 199d5406bd4SJohn Groves u64 pgmap_phys = dev_dax->pgmap[0].range.start; 200d5406bd4SJohn Groves 201d5406bd4SJohn Groves if (!WARN_ON(pgmap_phys > phys)) 202d5406bd4SJohn Groves data_offset = phys - pgmap_phys; 203d5406bd4SJohn Groves 204d5406bd4SJohn Groves pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", 205d5406bd4SJohn Groves __func__, phys, pgmap_phys, data_offset); 206d5406bd4SJohn Groves } 207*75945584SJohn Groves dev_dax->virt_addr = addr + data_offset; 208d5406bd4SJohn Groves 209d5406bd4SJohn Groves inode = dax_inode(dax_dev); 210d5406bd4SJohn Groves cdev = inode->i_cdev; 211d5406bd4SJohn Groves cdev_init(cdev, &fsdev_fops); 212d5406bd4SJohn Groves cdev->owner = dev->driver->owner; 213d5406bd4SJohn Groves cdev_set_parent(cdev, &dev->kobj); 214d5406bd4SJohn Groves rc = cdev_add(cdev, dev->devt, 1); 215d5406bd4SJohn Groves if (rc) 216d5406bd4SJohn Groves return rc; 217d5406bd4SJohn Groves 218d5406bd4SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); 219d5406bd4SJohn Groves if (rc) 220d5406bd4SJohn Groves return rc; 221d5406bd4SJohn Groves 222d5406bd4SJohn Groves run_dax(dax_dev); 223d5406bd4SJohn Groves return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); 224d5406bd4SJohn Groves } 225d5406bd4SJohn Groves 226d5406bd4SJohn Groves static struct dax_device_driver fsdev_dax_driver = { 227d5406bd4SJohn Groves .probe = fsdev_dax_probe, 228d5406bd4SJohn Groves .type = DAXDRV_FSDEV_TYPE, 229d5406bd4SJohn Groves }; 230d5406bd4SJohn Groves 231d5406bd4SJohn Groves static int __init dax_init(void) 232d5406bd4SJohn Groves { 233d5406bd4SJohn Groves return dax_driver_register(&fsdev_dax_driver); 234d5406bd4SJohn Groves } 235d5406bd4SJohn Groves 236d5406bd4SJohn Groves static void __exit dax_exit(void) 237d5406bd4SJohn Groves { 238d5406bd4SJohn Groves dax_driver_unregister(&fsdev_dax_driver); 239d5406bd4SJohn Groves } 240d5406bd4SJohn Groves 241d5406bd4SJohn Groves MODULE_AUTHOR("John Groves"); 242d5406bd4SJohn Groves MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); 243d5406bd4SJohn Groves MODULE_LICENSE("GPL"); 244d5406bd4SJohn Groves module_init(dax_init); 245d5406bd4SJohn Groves module_exit(dax_exit); 246d5406bd4SJohn Groves MODULE_ALIAS_DAX_DEVICE(0); 247