// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2026 Micron Technology, Inc. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax-private.h"
#include "bus.h"

/*
 * FS-DAX compatible devdax driver
 *
 * Unlike drivers/dax/device.c which pre-initializes compound folios based
 * on device alignment (via vmemmap_shift), this driver leaves folios
 * uninitialized similar to pmem. This allows fs-dax filesystems like famfs
 * to work without needing special handling for pre-initialized folios.
 *
 * Key differences from device.c:
 * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC)
 * - vmemmap_shift is NOT set (folios remain order-0)
 * - fs-dax can dynamically create compound folios as needed
 * - No mmap support - all access is through fs-dax/iomap
 */

/*
 * Copy @len bytes from @page (starting at byte @off within it) to the dax
 * mapping at @addr using cache-flushing stores (memcpy_flushcache), one
 * PAGE_SIZE-bounded chunk at a time.
 *
 * After the first chunk, @off resets to 0 and @page advances by pointer
 * increment — which assumes contiguous struct pages if len spans more than
 * one page. The only caller in this file (fsdev_dax_zero_page_range) passes
 * ZERO_PAGE(0) with len == PAGE_SIZE and off == 0, so the loop runs exactly
 * once there.
 */
static void fsdev_write_dax(void *addr, struct page *page,
		unsigned int off, unsigned int len)
{
	while (len) {
		/* Map the source page briefly; dax target needs no mapping */
		void *mem = kmap_local_page(page);
		unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off);

		memcpy_flushcache(addr, mem + off, chunk);
		kunmap_local(mem);
		len -= chunk;
		off = 0;
		page++;
		addr += chunk;
	}
}

/*
 * Translate a device page offset into a kernel virtual address and/or pfn
 * for the fs-dax access path.
 *
 * @dax_dev:  dax device whose private data is the owning dev_dax
 * @pgoff:    page offset into the device
 * @nr_pages: number of pages the caller wants to access
 * @mode:     DAX_ACCESS / DAX_RECOVERY_WRITE (unused here)
 * @kaddr:    out (optional): kernel virtual address for @pgoff
 * @pfn:      out (optional): pfn for @pgoff
 *
 * Returns the number of pages contiguously accessible at @pgoff (capped at
 * the device size), or -EFAULT when dax_pgoff_to_phys() reports @pgoff out
 * of range (its -1 sentinel).
 *
 * NOTE(review): @kaddr is derived as virt_addr + byte offset while the pfn
 * comes from dax_pgoff_to_phys(); the two agree only if the device's ranges
 * map linearly from virt_addr — confirm for multi-range devices.
 */
static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
		long nr_pages, enum dax_access_mode mode, void **kaddr,
		unsigned long *pfn)
{
	struct dev_dax *dev_dax = dax_get_private(dax_dev);
	size_t size = nr_pages << PAGE_SHIFT;
	size_t offset = pgoff << PAGE_SHIFT;
	void *virt_addr = dev_dax->virt_addr + offset;
	phys_addr_t phys;
	unsigned long local_pfn;

	phys = dax_pgoff_to_phys(dev_dax, pgoff, size);
	if (phys == -1) {
		dev_dbg(&dev_dax->dev,
			"pgoff (%#lx) out of range\n", pgoff);
		return -EFAULT;
	}

	if (kaddr)
		*kaddr = virt_addr;

	local_pfn = PHYS_PFN(phys);
	if (pfn)
		*pfn = local_pfn;

	/*
	 * Use cached_size which was computed at probe time. The size cannot
	 * change while the driver is bound (resize returns -EBUSY).
	 * cached_size - offset is presumed non-negative here because
	 * dax_pgoff_to_phys() already validated pgoff against the ranges.
	 */
	return PHYS_PFN(min(size, dev_dax->cached_size - offset));
}

/*
 * dax_operations.zero_page_range: zero one device page at @pgoff by copying
 * ZERO_PAGE(0) through the flushcache write path.
 *
 * Only a single page is zeroed regardless of @nr_pages; the WARN_ONCE
 * documents the expectation that the dax core calls this one page at a time.
 */
static int fsdev_dax_zero_page_range(struct dax_device *dax_dev,
		pgoff_t pgoff, size_t nr_pages)
{
	void *kaddr;
	long rc;

	WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__);
	rc = __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
	if (rc < 0)
		return rc;
	fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE);
	return 0;
}

/* dax_operations.direct_access: thin wrapper over the internal helper */
static long fsdev_dax_direct_access(struct dax_device *dax_dev,
		pgoff_t pgoff, long nr_pages, enum dax_access_mode mode,
		void **kaddr, unsigned long *pfn)
{
	return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode,
			kaddr, pfn);
}

/*
 * dax_operations.recovery_write: copy @bytes from the iov iterator to @addr
 * with cache-flushing stores. @pgoff is part of the dax_operations signature
 * but unused here — @addr is already the translated target address.
 */
static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	return _copy_from_iter_flushcache(addr, bytes, i);
}

/* Operations handed to the dax core for the fs-dax access path */
static const struct dax_operations dev_dax_ops = {
	.direct_access = fsdev_dax_direct_access,
	.zero_page_range = fsdev_dax_zero_page_range,
	.recovery_write = fsdev_dax_recovery_write,
};

/* devm action: remove the character device on unbind */
static void fsdev_cdev_del(void *cdev)
{
	cdev_del(cdev);
}

/* devm action: shut down the dev_dax instance on unbind */
static void fsdev_kill(void *dev_dax)
{
	kill_dev_dax(dev_dax);
}

/* devm action: detach our dax_operations from the dax_device on unbind */
static void fsdev_clear_ops(void *data)
{
	struct dev_dax *dev_dax = data;

	dax_set_ops(dev_dax->dax_dev, NULL);
}

/*
 * Page map operations for FS-DAX mode
 * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c
 *
 * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX.
 * The core mm code in free_zone_device_folio() handles the wake_up_var()
 * directly for this memory type.
 */

/*
 * Forward a memory-failure event on this device's pages to the dax holder
 * (e.g. a mounted fs-dax filesystem) as a (byte offset, length) pair.
 *
 * NOTE(review): the offset is computed against ranges[0] only; for a
 * multi-range device a pfn landing in a later, discontiguous range would
 * yield a non-linear offset — confirm multi-range behavior.
 */
static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap,
		unsigned long pfn, unsigned long nr_pages, int mf_flags)
{
	struct dev_dax *dev_dax = pgmap->owner;
	u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start;
	u64 len = nr_pages << PAGE_SHIFT;

	return dax_holder_notify_failure(dev_dax->dax_dev, offset,
			len, mf_flags);
}

static const struct dev_pagemap_ops fsdev_pagemap_ops = {
	.memory_failure = fsdev_pagemap_memory_failure,
};

/*
 * Clear any stale folio state from pages in the given range.
 * This is necessary because device_dax pre-initializes compound folios
 * based on vmemmap_shift, and that state may persist after driver unbind.
 * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
 * expects to find clean order-0 folios that it can build into compound
 * folios on demand.
 *
 * At probe time, no filesystem should be mounted yet, so all mappings
 * are stale and must be cleared along with compound state.
 */
static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
{
	for (int i = 0; i < dev_dax->nr_range; i++) {
		struct range *range = &dev_dax->ranges[i].range;
		unsigned long pfn = PHYS_PFN(range->start);
		unsigned long end_pfn = PHYS_PFN(range->end) + 1;

		while (pfn < end_pfn) {
			struct folio *folio = pfn_folio(pfn);
			/*
			 * dax_folio_reset_order() returns the order the folio
			 * had; advancing by 1 << order skips its tail pages.
			 */
			int order = dax_folio_reset_order(folio);

			pfn += 1UL << order;
		}
	}
}

/* devm action wrapper: re-clear folio state on unbind for the next driver */
static void fsdev_clear_folio_state_action(void *data)
{
	fsdev_clear_folio_state(data);
}

/* Char device open: stash the owning dev_dax for later fops use */
static int fsdev_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	struct dev_dax *dev_dax = dax_get_private(dax_dev);

	filp->private_data = dev_dax;

	return 0;
}

/* Char device release: nothing to undo (open only sets private_data) */
static int fsdev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

/*
 * Minimal fops: no mmap/read/write — data access goes exclusively through
 * the fs-dax/iomap path via dev_dax_ops (see file-header comment).
 */
static const struct file_operations fsdev_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = fsdev_open,
	.release = fsdev_release,
};

/*
 * Bind the fsdev-dax driver to a dev_dax instance.
 *
 * Sets up, in order: the dev_pagemap covering all ranges, the device memmap
 * (devm_memremap_pages), stale-folio cleanup, the character device, and the
 * dax_operations, then marks the dax_device live with run_dax().
 *
 * All teardown is registered via devm actions, which run in reverse order
 * on unbind: kill_dev_dax first, then ops clear, cdev removal, folio-state
 * clear, and finally the devm-managed memunmap/free.
 *
 * Returns 0 on success or a negative errno; on failure devm unwinds
 * whatever was already registered.
 */
static int fsdev_dax_probe(struct dev_dax *dev_dax)
{
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct device *dev = &dev_dax->dev;
	struct dev_pagemap *pgmap;
	struct inode *inode;
	u64 data_offset = 0;
	struct cdev *cdev;
	void *addr;
	int rc, i;

	if (static_dev_dax(dev_dax)) {
		/* A pre-built (static) pgmap can only describe one range */
		if (dev_dax->nr_range > 1) {
			dev_warn(dev, "static pgmap / multi-range device conflict\n");
			return -EINVAL;
		}

		pgmap = dev_dax->pgmap;
	} else {
		size_t pgmap_size;

		if (dev_dax->pgmap) {
			dev_warn(dev, "dynamic-dax with pre-populated page map\n");
			return -EINVAL;
		}

		/* struct dev_pagemap embeds one range; add nr_range - 1 more */
		pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1);
		pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL);
		if (!pgmap)
			return -ENOMEM;

		pgmap->nr_range = dev_dax->nr_range;
		dev_dax->pgmap = pgmap;

		for (i = 0; i < dev_dax->nr_range; i++) {
			struct range *range = &dev_dax->ranges[i].range;

			pgmap->ranges[i] = *range;
		}
	}

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range *range = &dev_dax->ranges[i].range;

		if (!devm_request_mem_region(dev, range->start,
				range_len(range), dev_name(dev))) {
			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
					i, range->start, range->end);
			return -EBUSY;
		}
	}

	/* Cache size now; it cannot change while driver is bound */
	dev_dax->cached_size = 0;
	for (i = 0; i < dev_dax->nr_range; i++)
		dev_dax->cached_size += range_len(&dev_dax->ranges[i].range);

	/*
	 * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving
	 * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this
	 * lets fs-dax dynamically build compound folios as needed, similar
	 * to pmem behavior.
	 */
	pgmap->type = MEMORY_DEVICE_FS_DAX;
	pgmap->ops = &fsdev_pagemap_ops;
	pgmap->owner = dev_dax;

	addr = devm_memremap_pages(dev, pgmap);
	if (IS_ERR(addr))
		return PTR_ERR(addr);

	/*
	 * Clear any stale compound folio state left over from a previous
	 * driver (e.g., device_dax with vmemmap_shift). Also register this
	 * as a devm action so folio state is cleared on unbind, ensuring
	 * clean pages for subsequent drivers (e.g., kmem for system-ram).
	 */
	fsdev_clear_folio_state(dev_dax);
	rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action,
			dev_dax);
	if (rc)
		return rc;

	/*
	 * Detect whether the data is at a non-zero offset into the memory:
	 * the pgmap may start below the first data range (presumably when
	 * memmap/metadata space precedes the data — confirm against the
	 * pgmap setup path), in which case virt_addr must skip past it.
	 */
	if (pgmap->range.start != dev_dax->ranges[0].range.start) {
		u64 phys = dev_dax->ranges[0].range.start;
		u64 pgmap_phys = dev_dax->pgmap[0].range.start;

		/* pgmap starting above the data range would be nonsense */
		if (!WARN_ON(pgmap_phys > phys))
			data_offset = phys - pgmap_phys;

		pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n",
				__func__, phys, pgmap_phys, data_offset);
	}
	dev_dax->virt_addr = addr + data_offset;

	inode = dax_inode(dax_dev);
	cdev = inode->i_cdev;
	cdev_init(cdev, &fsdev_fops);
	cdev->owner = dev->driver->owner;
	cdev_set_parent(cdev, &dev->kobj);
	rc = cdev_add(cdev, dev->devt, 1);
	if (rc)
		return rc;

	rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
	if (rc)
		return rc;

	/* Set the dax operations for fs-dax access path */
	rc = dax_set_ops(dax_dev, &dev_dax_ops);
	if (rc)
		return rc;

	rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax);
	if (rc)
		return rc;

	run_dax(dax_dev);
	/* Registered last, so fsdev_kill runs first on unbind */
	return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
}

static struct dax_device_driver fsdev_dax_driver = {
	.probe = fsdev_dax_probe,
	.type = DAXDRV_FSDEV_TYPE,
};

static int __init dax_init(void)
{
	return dax_driver_register(&fsdev_dax_driver);
}

static void __exit dax_exit(void)
{
	dax_driver_unregister(&fsdev_dax_driver);
}

MODULE_AUTHOR("John Groves");
MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
MODULE_LICENSE("GPL");
module_init(dax_init);
module_exit(dax_exit);
MODULE_ALIAS_DAX_DEVICE(0);