1*d5406bd4SJohn Groves // SPDX-License-Identifier: GPL-2.0 2*d5406bd4SJohn Groves /* Copyright(c) 2026 Micron Technology, Inc. */ 3*d5406bd4SJohn Groves #include <linux/memremap.h> 4*d5406bd4SJohn Groves #include <linux/pagemap.h> 5*d5406bd4SJohn Groves #include <linux/module.h> 6*d5406bd4SJohn Groves #include <linux/device.h> 7*d5406bd4SJohn Groves #include <linux/cdev.h> 8*d5406bd4SJohn Groves #include <linux/slab.h> 9*d5406bd4SJohn Groves #include <linux/dax.h> 10*d5406bd4SJohn Groves #include <linux/uio.h> 11*d5406bd4SJohn Groves #include <linux/fs.h> 12*d5406bd4SJohn Groves #include <linux/mm.h> 13*d5406bd4SJohn Groves #include "dax-private.h" 14*d5406bd4SJohn Groves #include "bus.h" 15*d5406bd4SJohn Groves 16*d5406bd4SJohn Groves /* 17*d5406bd4SJohn Groves * FS-DAX compatible devdax driver 18*d5406bd4SJohn Groves * 19*d5406bd4SJohn Groves * Unlike drivers/dax/device.c which pre-initializes compound folios based 20*d5406bd4SJohn Groves * on device alignment (via vmemmap_shift), this driver leaves folios 21*d5406bd4SJohn Groves * uninitialized similar to pmem. This allows fs-dax filesystems like famfs 22*d5406bd4SJohn Groves * to work without needing special handling for pre-initialized folios. 23*d5406bd4SJohn Groves * 24*d5406bd4SJohn Groves * Key differences from device.c: 25*d5406bd4SJohn Groves * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) 26*d5406bd4SJohn Groves * - vmemmap_shift is NOT set (folios remain order-0) 27*d5406bd4SJohn Groves * - fs-dax can dynamically create compound folios as needed 28*d5406bd4SJohn Groves * - No mmap support - all access is through fs-dax/iomap 29*d5406bd4SJohn Groves */ 30*d5406bd4SJohn Groves 31*d5406bd4SJohn Groves static void fsdev_cdev_del(void *cdev) 32*d5406bd4SJohn Groves { 33*d5406bd4SJohn Groves cdev_del(cdev); 34*d5406bd4SJohn Groves } 35*d5406bd4SJohn Groves 36*d5406bd4SJohn Groves static void fsdev_kill(void *dev_dax) 37*d5406bd4SJohn Groves { 38*d5406bd4SJohn Groves kill_dev_dax(dev_dax); 39*d5406bd4SJohn Groves } 40*d5406bd4SJohn Groves 41*d5406bd4SJohn Groves /* 42*d5406bd4SJohn Groves * Page map operations for FS-DAX mode 43*d5406bd4SJohn Groves * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c 44*d5406bd4SJohn Groves * 45*d5406bd4SJohn Groves * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. 46*d5406bd4SJohn Groves * The core mm code in free_zone_device_folio() handles the wake_up_var() 47*d5406bd4SJohn Groves * directly for this memory type. 48*d5406bd4SJohn Groves */ 49*d5406bd4SJohn Groves static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, 50*d5406bd4SJohn Groves unsigned long pfn, unsigned long nr_pages, int mf_flags) 51*d5406bd4SJohn Groves { 52*d5406bd4SJohn Groves struct dev_dax *dev_dax = pgmap->owner; 53*d5406bd4SJohn Groves u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; 54*d5406bd4SJohn Groves u64 len = nr_pages << PAGE_SHIFT; 55*d5406bd4SJohn Groves 56*d5406bd4SJohn Groves return dax_holder_notify_failure(dev_dax->dax_dev, offset, 57*d5406bd4SJohn Groves len, mf_flags); 58*d5406bd4SJohn Groves } 59*d5406bd4SJohn Groves 60*d5406bd4SJohn Groves static const struct dev_pagemap_ops fsdev_pagemap_ops = { 61*d5406bd4SJohn Groves .memory_failure = fsdev_pagemap_memory_failure, 62*d5406bd4SJohn Groves }; 63*d5406bd4SJohn Groves 64*d5406bd4SJohn Groves /* 65*d5406bd4SJohn Groves * Clear any stale folio state from pages in the given range. 66*d5406bd4SJohn Groves * This is necessary because device_dax pre-initializes compound folios 67*d5406bd4SJohn Groves * based on vmemmap_shift, and that state may persist after driver unbind. 68*d5406bd4SJohn Groves * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax 69*d5406bd4SJohn Groves * expects to find clean order-0 folios that it can build into compound 70*d5406bd4SJohn Groves * folios on demand. 71*d5406bd4SJohn Groves * 72*d5406bd4SJohn Groves * At probe time, no filesystem should be mounted yet, so all mappings 73*d5406bd4SJohn Groves * are stale and must be cleared along with compound state. 74*d5406bd4SJohn Groves */ 75*d5406bd4SJohn Groves static void fsdev_clear_folio_state(struct dev_dax *dev_dax) 76*d5406bd4SJohn Groves { 77*d5406bd4SJohn Groves for (int i = 0; i < dev_dax->nr_range; i++) { 78*d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 79*d5406bd4SJohn Groves unsigned long pfn = PHYS_PFN(range->start); 80*d5406bd4SJohn Groves unsigned long end_pfn = PHYS_PFN(range->end) + 1; 81*d5406bd4SJohn Groves 82*d5406bd4SJohn Groves while (pfn < end_pfn) { 83*d5406bd4SJohn Groves struct folio *folio = pfn_folio(pfn); 84*d5406bd4SJohn Groves int order = dax_folio_reset_order(folio); 85*d5406bd4SJohn Groves 86*d5406bd4SJohn Groves pfn += 1UL << order; 87*d5406bd4SJohn Groves } 88*d5406bd4SJohn Groves } 89*d5406bd4SJohn Groves } 90*d5406bd4SJohn Groves 91*d5406bd4SJohn Groves static void fsdev_clear_folio_state_action(void *data) 92*d5406bd4SJohn Groves { 93*d5406bd4SJohn Groves fsdev_clear_folio_state(data); 94*d5406bd4SJohn Groves } 95*d5406bd4SJohn Groves 96*d5406bd4SJohn Groves static int fsdev_open(struct inode *inode, struct file *filp) 97*d5406bd4SJohn Groves { 98*d5406bd4SJohn Groves struct dax_device *dax_dev = inode_dax(inode); 99*d5406bd4SJohn Groves struct dev_dax *dev_dax = dax_get_private(dax_dev); 100*d5406bd4SJohn Groves 101*d5406bd4SJohn Groves filp->private_data = dev_dax; 102*d5406bd4SJohn Groves 103*d5406bd4SJohn Groves return 0; 104*d5406bd4SJohn Groves } 105*d5406bd4SJohn Groves 106*d5406bd4SJohn Groves static int fsdev_release(struct inode *inode, struct file *filp) 107*d5406bd4SJohn Groves { 108*d5406bd4SJohn Groves return 0; 109*d5406bd4SJohn Groves } 110*d5406bd4SJohn Groves 111*d5406bd4SJohn Groves static const struct file_operations fsdev_fops = { 112*d5406bd4SJohn Groves .llseek = noop_llseek, 113*d5406bd4SJohn Groves .owner = THIS_MODULE, 114*d5406bd4SJohn Groves .open = fsdev_open, 115*d5406bd4SJohn Groves .release = fsdev_release, 116*d5406bd4SJohn Groves }; 117*d5406bd4SJohn Groves 118*d5406bd4SJohn Groves static int fsdev_dax_probe(struct dev_dax *dev_dax) 119*d5406bd4SJohn Groves { 120*d5406bd4SJohn Groves struct dax_device *dax_dev = dev_dax->dax_dev; 121*d5406bd4SJohn Groves struct device *dev = &dev_dax->dev; 122*d5406bd4SJohn Groves struct dev_pagemap *pgmap; 123*d5406bd4SJohn Groves struct inode *inode; 124*d5406bd4SJohn Groves struct cdev *cdev; 125*d5406bd4SJohn Groves void *addr; 126*d5406bd4SJohn Groves int rc, i; 127*d5406bd4SJohn Groves 128*d5406bd4SJohn Groves if (static_dev_dax(dev_dax)) { 129*d5406bd4SJohn Groves if (dev_dax->nr_range > 1) { 130*d5406bd4SJohn Groves dev_warn(dev, "static pgmap / multi-range device conflict\n"); 131*d5406bd4SJohn Groves return -EINVAL; 132*d5406bd4SJohn Groves } 133*d5406bd4SJohn Groves 134*d5406bd4SJohn Groves pgmap = dev_dax->pgmap; 135*d5406bd4SJohn Groves } else { 136*d5406bd4SJohn Groves size_t pgmap_size; 137*d5406bd4SJohn Groves 138*d5406bd4SJohn Groves if (dev_dax->pgmap) { 139*d5406bd4SJohn Groves dev_warn(dev, "dynamic-dax with pre-populated page map\n"); 140*d5406bd4SJohn Groves return -EINVAL; 141*d5406bd4SJohn Groves } 142*d5406bd4SJohn Groves 143*d5406bd4SJohn Groves pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); 144*d5406bd4SJohn Groves pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); 145*d5406bd4SJohn Groves if (!pgmap) 146*d5406bd4SJohn Groves return -ENOMEM; 147*d5406bd4SJohn Groves 148*d5406bd4SJohn Groves pgmap->nr_range = dev_dax->nr_range; 149*d5406bd4SJohn Groves dev_dax->pgmap = pgmap; 150*d5406bd4SJohn Groves 151*d5406bd4SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) { 152*d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 153*d5406bd4SJohn Groves 154*d5406bd4SJohn Groves pgmap->ranges[i] = *range; 155*d5406bd4SJohn Groves } 156*d5406bd4SJohn Groves } 157*d5406bd4SJohn Groves 158*d5406bd4SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) { 159*d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 160*d5406bd4SJohn Groves 161*d5406bd4SJohn Groves if (!devm_request_mem_region(dev, range->start, 162*d5406bd4SJohn Groves range_len(range), dev_name(dev))) { 163*d5406bd4SJohn Groves dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", 164*d5406bd4SJohn Groves i, range->start, range->end); 165*d5406bd4SJohn Groves return -EBUSY; 166*d5406bd4SJohn Groves } 167*d5406bd4SJohn Groves } 168*d5406bd4SJohn Groves 169*d5406bd4SJohn Groves /* 170*d5406bd4SJohn Groves * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving 171*d5406bd4SJohn Groves * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this 172*d5406bd4SJohn Groves * lets fs-dax dynamically build compound folios as needed, similar 173*d5406bd4SJohn Groves * to pmem behavior. 174*d5406bd4SJohn Groves */ 175*d5406bd4SJohn Groves pgmap->type = MEMORY_DEVICE_FS_DAX; 176*d5406bd4SJohn Groves pgmap->ops = &fsdev_pagemap_ops; 177*d5406bd4SJohn Groves pgmap->owner = dev_dax; 178*d5406bd4SJohn Groves 179*d5406bd4SJohn Groves addr = devm_memremap_pages(dev, pgmap); 180*d5406bd4SJohn Groves if (IS_ERR(addr)) 181*d5406bd4SJohn Groves return PTR_ERR(addr); 182*d5406bd4SJohn Groves 183*d5406bd4SJohn Groves /* 184*d5406bd4SJohn Groves * Clear any stale compound folio state left over from a previous 185*d5406bd4SJohn Groves * driver (e.g., device_dax with vmemmap_shift). Also register this 186*d5406bd4SJohn Groves * as a devm action so folio state is cleared on unbind, ensuring 187*d5406bd4SJohn Groves * clean pages for subsequent drivers (e.g., kmem for system-ram). 188*d5406bd4SJohn Groves */ 189*d5406bd4SJohn Groves fsdev_clear_folio_state(dev_dax); 190*d5406bd4SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, 191*d5406bd4SJohn Groves dev_dax); 192*d5406bd4SJohn Groves if (rc) 193*d5406bd4SJohn Groves return rc; 194*d5406bd4SJohn Groves 195*d5406bd4SJohn Groves /* Detect whether the data is at a non-zero offset into the memory */ 196*d5406bd4SJohn Groves if (pgmap->range.start != dev_dax->ranges[0].range.start) { 197*d5406bd4SJohn Groves u64 phys = dev_dax->ranges[0].range.start; 198*d5406bd4SJohn Groves u64 pgmap_phys = dev_dax->pgmap[0].range.start; 199*d5406bd4SJohn Groves u64 data_offset = 0; 200*d5406bd4SJohn Groves 201*d5406bd4SJohn Groves if (!WARN_ON(pgmap_phys > phys)) 202*d5406bd4SJohn Groves data_offset = phys - pgmap_phys; 203*d5406bd4SJohn Groves 204*d5406bd4SJohn Groves pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", 205*d5406bd4SJohn Groves __func__, phys, pgmap_phys, data_offset); 206*d5406bd4SJohn Groves } 207*d5406bd4SJohn Groves 208*d5406bd4SJohn Groves inode = dax_inode(dax_dev); 209*d5406bd4SJohn Groves cdev = inode->i_cdev; 210*d5406bd4SJohn Groves cdev_init(cdev, &fsdev_fops); 211*d5406bd4SJohn Groves cdev->owner = dev->driver->owner; 212*d5406bd4SJohn Groves cdev_set_parent(cdev, &dev->kobj); 213*d5406bd4SJohn Groves rc = cdev_add(cdev, dev->devt, 1); 214*d5406bd4SJohn Groves if (rc) 215*d5406bd4SJohn Groves return rc; 216*d5406bd4SJohn Groves 217*d5406bd4SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); 218*d5406bd4SJohn Groves if (rc) 219*d5406bd4SJohn Groves return rc; 220*d5406bd4SJohn Groves 221*d5406bd4SJohn Groves run_dax(dax_dev); 222*d5406bd4SJohn Groves return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); 223*d5406bd4SJohn Groves } 224*d5406bd4SJohn Groves 225*d5406bd4SJohn Groves static struct dax_device_driver fsdev_dax_driver = { 226*d5406bd4SJohn Groves .probe = fsdev_dax_probe, 227*d5406bd4SJohn Groves .type = DAXDRV_FSDEV_TYPE, 228*d5406bd4SJohn Groves }; 229*d5406bd4SJohn Groves 230*d5406bd4SJohn Groves static int __init dax_init(void) 231*d5406bd4SJohn Groves { 232*d5406bd4SJohn Groves return dax_driver_register(&fsdev_dax_driver); 233*d5406bd4SJohn Groves } 234*d5406bd4SJohn Groves 235*d5406bd4SJohn Groves static void __exit dax_exit(void) 236*d5406bd4SJohn Groves { 237*d5406bd4SJohn Groves dax_driver_unregister(&fsdev_dax_driver); 238*d5406bd4SJohn Groves } 239*d5406bd4SJohn Groves 240*d5406bd4SJohn Groves MODULE_AUTHOR("John Groves"); 241*d5406bd4SJohn Groves MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); 242*d5406bd4SJohn Groves MODULE_LICENSE("GPL"); 243*d5406bd4SJohn Groves module_init(dax_init); 244*d5406bd4SJohn Groves module_exit(dax_exit); 245*d5406bd4SJohn Groves MODULE_ALIAS_DAX_DEVICE(0); 246