1d5406bd4SJohn Groves // SPDX-License-Identifier: GPL-2.0 2d5406bd4SJohn Groves /* Copyright(c) 2026 Micron Technology, Inc. */ 3d5406bd4SJohn Groves #include <linux/memremap.h> 4d5406bd4SJohn Groves #include <linux/pagemap.h> 5d5406bd4SJohn Groves #include <linux/module.h> 6d5406bd4SJohn Groves #include <linux/device.h> 7d5406bd4SJohn Groves #include <linux/cdev.h> 8d5406bd4SJohn Groves #include <linux/slab.h> 9d5406bd4SJohn Groves #include <linux/dax.h> 10d5406bd4SJohn Groves #include <linux/uio.h> 11d5406bd4SJohn Groves #include <linux/fs.h> 12d5406bd4SJohn Groves #include <linux/mm.h> 13d5406bd4SJohn Groves #include "dax-private.h" 14d5406bd4SJohn Groves #include "bus.h" 15d5406bd4SJohn Groves 16d5406bd4SJohn Groves /* 17d5406bd4SJohn Groves * FS-DAX compatible devdax driver 18d5406bd4SJohn Groves * 19d5406bd4SJohn Groves * Unlike drivers/dax/device.c which pre-initializes compound folios based 20d5406bd4SJohn Groves * on device alignment (via vmemmap_shift), this driver leaves folios 21d5406bd4SJohn Groves * uninitialized similar to pmem. This allows fs-dax filesystems like famfs 22d5406bd4SJohn Groves * to work without needing special handling for pre-initialized folios. 23d5406bd4SJohn Groves * 24d5406bd4SJohn Groves * Key differences from device.c: 25d5406bd4SJohn Groves * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) 26d5406bd4SJohn Groves * - vmemmap_shift is NOT set (folios remain order-0) 27d5406bd4SJohn Groves * - fs-dax can dynamically create compound folios as needed 28d5406bd4SJohn Groves * - No mmap support - all access is through fs-dax/iomap 29d5406bd4SJohn Groves */ 30d5406bd4SJohn Groves 31099c81a1SJohn Groves static void fsdev_write_dax(void *addr, struct page *page, 32099c81a1SJohn Groves unsigned int off, unsigned int len) 33099c81a1SJohn Groves { 34099c81a1SJohn Groves while (len) { 35099c81a1SJohn Groves void *mem = kmap_local_page(page); 36099c81a1SJohn Groves unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off); 37099c81a1SJohn Groves 38099c81a1SJohn Groves memcpy_flushcache(addr, mem + off, chunk); 39099c81a1SJohn Groves kunmap_local(mem); 40099c81a1SJohn Groves len -= chunk; 41099c81a1SJohn Groves off = 0; 42099c81a1SJohn Groves page++; 43099c81a1SJohn Groves addr += chunk; 44099c81a1SJohn Groves } 45099c81a1SJohn Groves } 46099c81a1SJohn Groves 47099c81a1SJohn Groves static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 48099c81a1SJohn Groves long nr_pages, enum dax_access_mode mode, void **kaddr, 49099c81a1SJohn Groves unsigned long *pfn) 50099c81a1SJohn Groves { 51099c81a1SJohn Groves struct dev_dax *dev_dax = dax_get_private(dax_dev); 52099c81a1SJohn Groves size_t size = nr_pages << PAGE_SHIFT; 53099c81a1SJohn Groves size_t offset = pgoff << PAGE_SHIFT; 54099c81a1SJohn Groves void *virt_addr = dev_dax->virt_addr + offset; 55099c81a1SJohn Groves phys_addr_t phys; 56099c81a1SJohn Groves unsigned long local_pfn; 57099c81a1SJohn Groves 58099c81a1SJohn Groves phys = dax_pgoff_to_phys(dev_dax, pgoff, size); 59099c81a1SJohn Groves if (phys == -1) { 60099c81a1SJohn Groves dev_dbg(&dev_dax->dev, 61099c81a1SJohn Groves "pgoff (%#lx) out of range\n", pgoff); 62099c81a1SJohn Groves return -EFAULT; 63099c81a1SJohn Groves } 64099c81a1SJohn Groves 65099c81a1SJohn Groves if (kaddr) 66099c81a1SJohn Groves *kaddr = virt_addr; 67099c81a1SJohn Groves 68099c81a1SJohn Groves local_pfn = PHYS_PFN(phys); 69099c81a1SJohn Groves if (pfn) 70099c81a1SJohn Groves *pfn = local_pfn; 71099c81a1SJohn Groves 72099c81a1SJohn Groves /* 73099c81a1SJohn Groves * Use cached_size which was computed at probe time. The size cannot 74099c81a1SJohn Groves * change while the driver is bound (resize returns -EBUSY). 75099c81a1SJohn Groves */ 76099c81a1SJohn Groves return PHYS_PFN(min(size, dev_dax->cached_size - offset)); 77099c81a1SJohn Groves } 78099c81a1SJohn Groves 79099c81a1SJohn Groves static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, 80099c81a1SJohn Groves pgoff_t pgoff, size_t nr_pages) 81099c81a1SJohn Groves { 82099c81a1SJohn Groves void *kaddr; 83*45df9111SJohn Groves long rc; 84099c81a1SJohn Groves 85099c81a1SJohn Groves WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); 86*45df9111SJohn Groves rc = __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); 87*45df9111SJohn Groves if (rc < 0) 88*45df9111SJohn Groves return rc; 89099c81a1SJohn Groves fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); 90099c81a1SJohn Groves return 0; 91099c81a1SJohn Groves } 92099c81a1SJohn Groves 93099c81a1SJohn Groves static long fsdev_dax_direct_access(struct dax_device *dax_dev, 94099c81a1SJohn Groves pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, 95099c81a1SJohn Groves void **kaddr, unsigned long *pfn) 96099c81a1SJohn Groves { 97099c81a1SJohn Groves return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode, 98099c81a1SJohn Groves kaddr, pfn); 99099c81a1SJohn Groves } 100099c81a1SJohn Groves 101099c81a1SJohn Groves static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, 102099c81a1SJohn Groves void *addr, size_t bytes, struct iov_iter *i) 103099c81a1SJohn Groves { 104099c81a1SJohn Groves return _copy_from_iter_flushcache(addr, bytes, i); 105099c81a1SJohn Groves } 106099c81a1SJohn Groves 107099c81a1SJohn Groves static const struct dax_operations dev_dax_ops = { 108099c81a1SJohn Groves .direct_access = fsdev_dax_direct_access, 109099c81a1SJohn Groves .zero_page_range = fsdev_dax_zero_page_range, 110099c81a1SJohn Groves .recovery_write = fsdev_dax_recovery_write, 111099c81a1SJohn Groves }; 112099c81a1SJohn Groves 113d5406bd4SJohn Groves static void fsdev_cdev_del(void *cdev) 114d5406bd4SJohn Groves { 115d5406bd4SJohn Groves cdev_del(cdev); 116d5406bd4SJohn Groves } 117d5406bd4SJohn Groves 118d5406bd4SJohn Groves static void fsdev_kill(void *dev_dax) 119d5406bd4SJohn Groves { 120d5406bd4SJohn Groves kill_dev_dax(dev_dax); 121d5406bd4SJohn Groves } 122d5406bd4SJohn Groves 123700ecbc1SJohn Groves static void fsdev_clear_ops(void *data) 124700ecbc1SJohn Groves { 125700ecbc1SJohn Groves struct dev_dax *dev_dax = data; 126700ecbc1SJohn Groves 127700ecbc1SJohn Groves dax_set_ops(dev_dax->dax_dev, NULL); 128700ecbc1SJohn Groves } 129700ecbc1SJohn Groves 130d5406bd4SJohn Groves /* 131d5406bd4SJohn Groves * Page map operations for FS-DAX mode 132d5406bd4SJohn Groves * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c 133d5406bd4SJohn Groves * 134d5406bd4SJohn Groves * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. 135d5406bd4SJohn Groves * The core mm code in free_zone_device_folio() handles the wake_up_var() 136d5406bd4SJohn Groves * directly for this memory type. 137d5406bd4SJohn Groves */ 138d5406bd4SJohn Groves static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, 139d5406bd4SJohn Groves unsigned long pfn, unsigned long nr_pages, int mf_flags) 140d5406bd4SJohn Groves { 141d5406bd4SJohn Groves struct dev_dax *dev_dax = pgmap->owner; 142d5406bd4SJohn Groves u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; 143d5406bd4SJohn Groves u64 len = nr_pages << PAGE_SHIFT; 144d5406bd4SJohn Groves 145d5406bd4SJohn Groves return dax_holder_notify_failure(dev_dax->dax_dev, offset, 146d5406bd4SJohn Groves len, mf_flags); 147d5406bd4SJohn Groves } 148d5406bd4SJohn Groves 149d5406bd4SJohn Groves static const struct dev_pagemap_ops fsdev_pagemap_ops = { 150d5406bd4SJohn Groves .memory_failure = fsdev_pagemap_memory_failure, 151d5406bd4SJohn Groves }; 152d5406bd4SJohn Groves 153d5406bd4SJohn Groves /* 154d5406bd4SJohn Groves * Clear any stale folio state from pages in the given range. 155d5406bd4SJohn Groves * This is necessary because device_dax pre-initializes compound folios 156d5406bd4SJohn Groves * based on vmemmap_shift, and that state may persist after driver unbind. 157d5406bd4SJohn Groves * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax 158d5406bd4SJohn Groves * expects to find clean order-0 folios that it can build into compound 159d5406bd4SJohn Groves * folios on demand. 160d5406bd4SJohn Groves * 161d5406bd4SJohn Groves * At probe time, no filesystem should be mounted yet, so all mappings 162d5406bd4SJohn Groves * are stale and must be cleared along with compound state. 163d5406bd4SJohn Groves */ 164d5406bd4SJohn Groves static void fsdev_clear_folio_state(struct dev_dax *dev_dax) 165d5406bd4SJohn Groves { 166d5406bd4SJohn Groves for (int i = 0; i < dev_dax->nr_range; i++) { 167d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 168d5406bd4SJohn Groves unsigned long pfn = PHYS_PFN(range->start); 169d5406bd4SJohn Groves unsigned long end_pfn = PHYS_PFN(range->end) + 1; 170d5406bd4SJohn Groves 171d5406bd4SJohn Groves while (pfn < end_pfn) { 172d5406bd4SJohn Groves struct folio *folio = pfn_folio(pfn); 173d5406bd4SJohn Groves int order = dax_folio_reset_order(folio); 174d5406bd4SJohn Groves 175d5406bd4SJohn Groves pfn += 1UL << order; 176d5406bd4SJohn Groves } 177d5406bd4SJohn Groves } 178d5406bd4SJohn Groves } 179d5406bd4SJohn Groves 180d5406bd4SJohn Groves static void fsdev_clear_folio_state_action(void *data) 181d5406bd4SJohn Groves { 182d5406bd4SJohn Groves fsdev_clear_folio_state(data); 183d5406bd4SJohn Groves } 184d5406bd4SJohn Groves 185d5406bd4SJohn Groves static int fsdev_open(struct inode *inode, struct file *filp) 186d5406bd4SJohn Groves { 187d5406bd4SJohn Groves struct dax_device *dax_dev = inode_dax(inode); 188d5406bd4SJohn Groves struct dev_dax *dev_dax = dax_get_private(dax_dev); 189d5406bd4SJohn Groves 190d5406bd4SJohn Groves filp->private_data = dev_dax; 191d5406bd4SJohn Groves 192d5406bd4SJohn Groves return 0; 193d5406bd4SJohn Groves } 194d5406bd4SJohn Groves 195d5406bd4SJohn Groves static int fsdev_release(struct inode *inode, struct file *filp) 196d5406bd4SJohn Groves { 197d5406bd4SJohn Groves return 0; 198d5406bd4SJohn Groves } 199d5406bd4SJohn Groves 200d5406bd4SJohn Groves static const struct file_operations fsdev_fops = { 201d5406bd4SJohn Groves .llseek = noop_llseek, 202d5406bd4SJohn Groves .owner = THIS_MODULE, 203d5406bd4SJohn Groves .open = fsdev_open, 204d5406bd4SJohn Groves .release = fsdev_release, 205d5406bd4SJohn Groves }; 206d5406bd4SJohn Groves 207d5406bd4SJohn Groves static int fsdev_dax_probe(struct dev_dax *dev_dax) 208d5406bd4SJohn Groves { 209d5406bd4SJohn Groves struct dax_device *dax_dev = dev_dax->dax_dev; 210d5406bd4SJohn Groves struct device *dev = &dev_dax->dev; 211d5406bd4SJohn Groves struct dev_pagemap *pgmap; 212d5406bd4SJohn Groves struct inode *inode; 21375945584SJohn Groves u64 data_offset = 0; 214d5406bd4SJohn Groves struct cdev *cdev; 215d5406bd4SJohn Groves void *addr; 216d5406bd4SJohn Groves int rc, i; 217d5406bd4SJohn Groves 218d5406bd4SJohn Groves if (static_dev_dax(dev_dax)) { 219d5406bd4SJohn Groves if (dev_dax->nr_range > 1) { 220d5406bd4SJohn Groves dev_warn(dev, "static pgmap / multi-range device conflict\n"); 221d5406bd4SJohn Groves return -EINVAL; 222d5406bd4SJohn Groves } 223d5406bd4SJohn Groves 224d5406bd4SJohn Groves pgmap = dev_dax->pgmap; 225d5406bd4SJohn Groves } else { 226d5406bd4SJohn Groves size_t pgmap_size; 227d5406bd4SJohn Groves 228d5406bd4SJohn Groves if (dev_dax->pgmap) { 229d5406bd4SJohn Groves dev_warn(dev, "dynamic-dax with pre-populated page map\n"); 230d5406bd4SJohn Groves return -EINVAL; 231d5406bd4SJohn Groves } 232d5406bd4SJohn Groves 233d5406bd4SJohn Groves pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); 234d5406bd4SJohn Groves pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); 235d5406bd4SJohn Groves if (!pgmap) 236d5406bd4SJohn Groves return -ENOMEM; 237d5406bd4SJohn Groves 238d5406bd4SJohn Groves pgmap->nr_range = dev_dax->nr_range; 239d5406bd4SJohn Groves dev_dax->pgmap = pgmap; 240d5406bd4SJohn Groves 241d5406bd4SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) { 242d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 243d5406bd4SJohn Groves 244d5406bd4SJohn Groves pgmap->ranges[i] = *range; 245d5406bd4SJohn Groves } 246d5406bd4SJohn Groves } 247d5406bd4SJohn Groves 248d5406bd4SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) { 249d5406bd4SJohn Groves struct range *range = &dev_dax->ranges[i].range; 250d5406bd4SJohn Groves 251d5406bd4SJohn Groves if (!devm_request_mem_region(dev, range->start, 252d5406bd4SJohn Groves range_len(range), dev_name(dev))) { 253d5406bd4SJohn Groves dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", 254d5406bd4SJohn Groves i, range->start, range->end); 255d5406bd4SJohn Groves return -EBUSY; 256d5406bd4SJohn Groves } 257d5406bd4SJohn Groves } 258d5406bd4SJohn Groves 259099c81a1SJohn Groves /* Cache size now; it cannot change while driver is bound */ 260099c81a1SJohn Groves dev_dax->cached_size = 0; 261099c81a1SJohn Groves for (i = 0; i < dev_dax->nr_range; i++) 262099c81a1SJohn Groves dev_dax->cached_size += range_len(&dev_dax->ranges[i].range); 263099c81a1SJohn Groves 264d5406bd4SJohn Groves /* 265d5406bd4SJohn Groves * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving 266d5406bd4SJohn Groves * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this 267d5406bd4SJohn Groves * lets fs-dax dynamically build compound folios as needed, similar 268d5406bd4SJohn Groves * to pmem behavior. 269d5406bd4SJohn Groves */ 270d5406bd4SJohn Groves pgmap->type = MEMORY_DEVICE_FS_DAX; 271d5406bd4SJohn Groves pgmap->ops = &fsdev_pagemap_ops; 272d5406bd4SJohn Groves pgmap->owner = dev_dax; 273d5406bd4SJohn Groves 274d5406bd4SJohn Groves addr = devm_memremap_pages(dev, pgmap); 275d5406bd4SJohn Groves if (IS_ERR(addr)) 276d5406bd4SJohn Groves return PTR_ERR(addr); 277d5406bd4SJohn Groves 278d5406bd4SJohn Groves /* 279d5406bd4SJohn Groves * Clear any stale compound folio state left over from a previous 280d5406bd4SJohn Groves * driver (e.g., device_dax with vmemmap_shift). Also register this 281d5406bd4SJohn Groves * as a devm action so folio state is cleared on unbind, ensuring 282d5406bd4SJohn Groves * clean pages for subsequent drivers (e.g., kmem for system-ram). 283d5406bd4SJohn Groves */ 284d5406bd4SJohn Groves fsdev_clear_folio_state(dev_dax); 285d5406bd4SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, 286d5406bd4SJohn Groves dev_dax); 287d5406bd4SJohn Groves if (rc) 288d5406bd4SJohn Groves return rc; 289d5406bd4SJohn Groves 290d5406bd4SJohn Groves /* Detect whether the data is at a non-zero offset into the memory */ 291d5406bd4SJohn Groves if (pgmap->range.start != dev_dax->ranges[0].range.start) { 292d5406bd4SJohn Groves u64 phys = dev_dax->ranges[0].range.start; 293d5406bd4SJohn Groves u64 pgmap_phys = dev_dax->pgmap[0].range.start; 294d5406bd4SJohn Groves 295d5406bd4SJohn Groves if (!WARN_ON(pgmap_phys > phys)) 296d5406bd4SJohn Groves data_offset = phys - pgmap_phys; 297d5406bd4SJohn Groves 298d5406bd4SJohn Groves pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", 299d5406bd4SJohn Groves __func__, phys, pgmap_phys, data_offset); 300d5406bd4SJohn Groves } 30175945584SJohn Groves dev_dax->virt_addr = addr + data_offset; 302d5406bd4SJohn Groves 303d5406bd4SJohn Groves inode = dax_inode(dax_dev); 304d5406bd4SJohn Groves cdev = inode->i_cdev; 305d5406bd4SJohn Groves cdev_init(cdev, &fsdev_fops); 306d5406bd4SJohn Groves cdev->owner = dev->driver->owner; 307d5406bd4SJohn Groves cdev_set_parent(cdev, &dev->kobj); 308d5406bd4SJohn Groves rc = cdev_add(cdev, dev->devt, 1); 309d5406bd4SJohn Groves if (rc) 310d5406bd4SJohn Groves return rc; 311d5406bd4SJohn Groves 312d5406bd4SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); 313d5406bd4SJohn Groves if (rc) 314d5406bd4SJohn Groves return rc; 315d5406bd4SJohn Groves 316700ecbc1SJohn Groves /* Set the dax operations for fs-dax access path */ 317700ecbc1SJohn Groves rc = dax_set_ops(dax_dev, &dev_dax_ops); 318700ecbc1SJohn Groves if (rc) 319700ecbc1SJohn Groves return rc; 320700ecbc1SJohn Groves 321700ecbc1SJohn Groves rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax); 322700ecbc1SJohn Groves if (rc) 323700ecbc1SJohn Groves return rc; 324700ecbc1SJohn Groves 325d5406bd4SJohn Groves run_dax(dax_dev); 326d5406bd4SJohn Groves return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); 327d5406bd4SJohn Groves } 328d5406bd4SJohn Groves 329d5406bd4SJohn Groves static struct dax_device_driver fsdev_dax_driver = { 330d5406bd4SJohn Groves .probe = fsdev_dax_probe, 331d5406bd4SJohn Groves .type = DAXDRV_FSDEV_TYPE, 332d5406bd4SJohn Groves }; 333d5406bd4SJohn Groves 334d5406bd4SJohn Groves static int __init dax_init(void) 335d5406bd4SJohn Groves { 336d5406bd4SJohn Groves return dax_driver_register(&fsdev_dax_driver); 337d5406bd4SJohn Groves } 338d5406bd4SJohn Groves 339d5406bd4SJohn Groves static void __exit dax_exit(void) 340d5406bd4SJohn Groves { 341d5406bd4SJohn Groves dax_driver_unregister(&fsdev_dax_driver); 342d5406bd4SJohn Groves } 343d5406bd4SJohn Groves 344d5406bd4SJohn Groves MODULE_AUTHOR("John Groves"); 345d5406bd4SJohn Groves MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); 346d5406bd4SJohn Groves MODULE_LICENSE("GPL"); 347d5406bd4SJohn Groves module_init(dax_init); 348d5406bd4SJohn Groves module_exit(dax_exit); 349d5406bd4SJohn Groves MODULE_ALIAS_DAX_DEVICE(0); 350