1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright(c) 2026 Micron Technology, Inc. */ 3 #include <linux/memremap.h> 4 #include <linux/pagemap.h> 5 #include <linux/module.h> 6 #include <linux/device.h> 7 #include <linux/cdev.h> 8 #include <linux/slab.h> 9 #include <linux/dax.h> 10 #include <linux/uio.h> 11 #include <linux/fs.h> 12 #include <linux/mm.h> 13 #include "dax-private.h" 14 #include "bus.h" 15 16 /* 17 * FS-DAX compatible devdax driver 18 * 19 * Unlike drivers/dax/device.c which pre-initializes compound folios based 20 * on device alignment (via vmemmap_shift), this driver leaves folios 21 * uninitialized similar to pmem. This allows fs-dax filesystems like famfs 22 * to work without needing special handling for pre-initialized folios. 23 * 24 * Key differences from device.c: 25 * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) 26 * - vmemmap_shift is NOT set (folios remain order-0) 27 * - fs-dax can dynamically create compound folios as needed 28 * - No mmap support - all access is through fs-dax/iomap 29 */ 30 31 static void fsdev_cdev_del(void *cdev) 32 { 33 cdev_del(cdev); 34 } 35 36 static void fsdev_kill(void *dev_dax) 37 { 38 kill_dev_dax(dev_dax); 39 } 40 41 /* 42 * Page map operations for FS-DAX mode 43 * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c 44 * 45 * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. 46 * The core mm code in free_zone_device_folio() handles the wake_up_var() 47 * directly for this memory type. 48 */ 49 static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, 50 unsigned long pfn, unsigned long nr_pages, int mf_flags) 51 { 52 struct dev_dax *dev_dax = pgmap->owner; 53 u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; 54 u64 len = nr_pages << PAGE_SHIFT; 55 56 return dax_holder_notify_failure(dev_dax->dax_dev, offset, 57 len, mf_flags); 58 } 59 60 static const struct dev_pagemap_ops fsdev_pagemap_ops = { 61 .memory_failure = fsdev_pagemap_memory_failure, 62 }; 63 64 /* 65 * Clear any stale folio state from pages in the given range. 66 * This is necessary because device_dax pre-initializes compound folios 67 * based on vmemmap_shift, and that state may persist after driver unbind. 68 * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax 69 * expects to find clean order-0 folios that it can build into compound 70 * folios on demand. 71 * 72 * At probe time, no filesystem should be mounted yet, so all mappings 73 * are stale and must be cleared along with compound state. 74 */ 75 static void fsdev_clear_folio_state(struct dev_dax *dev_dax) 76 { 77 for (int i = 0; i < dev_dax->nr_range; i++) { 78 struct range *range = &dev_dax->ranges[i].range; 79 unsigned long pfn = PHYS_PFN(range->start); 80 unsigned long end_pfn = PHYS_PFN(range->end) + 1; 81 82 while (pfn < end_pfn) { 83 struct folio *folio = pfn_folio(pfn); 84 int order = dax_folio_reset_order(folio); 85 86 pfn += 1UL << order; 87 } 88 } 89 } 90 91 static void fsdev_clear_folio_state_action(void *data) 92 { 93 fsdev_clear_folio_state(data); 94 } 95 96 static int fsdev_open(struct inode *inode, struct file *filp) 97 { 98 struct dax_device *dax_dev = inode_dax(inode); 99 struct dev_dax *dev_dax = dax_get_private(dax_dev); 100 101 filp->private_data = dev_dax; 102 103 return 0; 104 } 105 106 static int fsdev_release(struct inode *inode, struct file *filp) 107 { 108 return 0; 109 } 110 111 static const struct file_operations fsdev_fops = { 112 .llseek = noop_llseek, 113 .owner = THIS_MODULE, 114 .open = fsdev_open, 115 .release = fsdev_release, 116 }; 117 118 static int fsdev_dax_probe(struct dev_dax *dev_dax) 119 { 120 struct dax_device *dax_dev = dev_dax->dax_dev; 121 struct device *dev = &dev_dax->dev; 122 struct dev_pagemap *pgmap; 123 struct inode *inode; 124 u64 data_offset = 0; 125 struct cdev *cdev; 126 void *addr; 127 int rc, i; 128 129 if (static_dev_dax(dev_dax)) { 130 if (dev_dax->nr_range > 1) { 131 dev_warn(dev, "static pgmap / multi-range device conflict\n"); 132 return -EINVAL; 133 } 134 135 pgmap = dev_dax->pgmap; 136 } else { 137 size_t pgmap_size; 138 139 if (dev_dax->pgmap) { 140 dev_warn(dev, "dynamic-dax with pre-populated page map\n"); 141 return -EINVAL; 142 } 143 144 pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); 145 pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); 146 if (!pgmap) 147 return -ENOMEM; 148 149 pgmap->nr_range = dev_dax->nr_range; 150 dev_dax->pgmap = pgmap; 151 152 for (i = 0; i < dev_dax->nr_range; i++) { 153 struct range *range = &dev_dax->ranges[i].range; 154 155 pgmap->ranges[i] = *range; 156 } 157 } 158 159 for (i = 0; i < dev_dax->nr_range; i++) { 160 struct range *range = &dev_dax->ranges[i].range; 161 162 if (!devm_request_mem_region(dev, range->start, 163 range_len(range), dev_name(dev))) { 164 dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", 165 i, range->start, range->end); 166 return -EBUSY; 167 } 168 } 169 170 /* 171 * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving 172 * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this 173 * lets fs-dax dynamically build compound folios as needed, similar 174 * to pmem behavior. 175 */ 176 pgmap->type = MEMORY_DEVICE_FS_DAX; 177 pgmap->ops = &fsdev_pagemap_ops; 178 pgmap->owner = dev_dax; 179 180 addr = devm_memremap_pages(dev, pgmap); 181 if (IS_ERR(addr)) 182 return PTR_ERR(addr); 183 184 /* 185 * Clear any stale compound folio state left over from a previous 186 * driver (e.g., device_dax with vmemmap_shift). Also register this 187 * as a devm action so folio state is cleared on unbind, ensuring 188 * clean pages for subsequent drivers (e.g., kmem for system-ram). 189 */ 190 fsdev_clear_folio_state(dev_dax); 191 rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, 192 dev_dax); 193 if (rc) 194 return rc; 195 196 /* Detect whether the data is at a non-zero offset into the memory */ 197 if (pgmap->range.start != dev_dax->ranges[0].range.start) { 198 u64 phys = dev_dax->ranges[0].range.start; 199 u64 pgmap_phys = dev_dax->pgmap[0].range.start; 200 201 if (!WARN_ON(pgmap_phys > phys)) 202 data_offset = phys - pgmap_phys; 203 204 pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", 205 __func__, phys, pgmap_phys, data_offset); 206 } 207 dev_dax->virt_addr = addr + data_offset; 208 209 inode = dax_inode(dax_dev); 210 cdev = inode->i_cdev; 211 cdev_init(cdev, &fsdev_fops); 212 cdev->owner = dev->driver->owner; 213 cdev_set_parent(cdev, &dev->kobj); 214 rc = cdev_add(cdev, dev->devt, 1); 215 if (rc) 216 return rc; 217 218 rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); 219 if (rc) 220 return rc; 221 222 run_dax(dax_dev); 223 return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); 224 } 225 226 static struct dax_device_driver fsdev_dax_driver = { 227 .probe = fsdev_dax_probe, 228 .type = DAXDRV_FSDEV_TYPE, 229 }; 230 231 static int __init dax_init(void) 232 { 233 return dax_driver_register(&fsdev_dax_driver); 234 } 235 236 static void __exit dax_exit(void) 237 { 238 dax_driver_unregister(&fsdev_dax_driver); 239 } 240 241 MODULE_AUTHOR("John Groves"); 242 MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); 243 MODULE_LICENSE("GPL"); 244 module_init(dax_init); 245 module_exit(dax_exit); 246 MODULE_ALIAS_DAX_DEVICE(0); 247