1d5406bd4SJohn Groves // SPDX-License-Identifier: GPL-2.0 2d5406bd4SJohn Groves /* Copyright(c) 2026 Micron Technology, Inc. */ 3d5406bd4SJohn Groves #include <linux/memremap.h> 4d5406bd4SJohn Groves #include <linux/pagemap.h> 5d5406bd4SJohn Groves #include <linux/module.h> 6d5406bd4SJohn Groves #include <linux/device.h> 7d5406bd4SJohn Groves #include <linux/cdev.h> 8d5406bd4SJohn Groves #include <linux/slab.h> 9d5406bd4SJohn Groves #include <linux/dax.h> 10d5406bd4SJohn Groves #include <linux/uio.h> 11d5406bd4SJohn Groves #include <linux/fs.h> 12d5406bd4SJohn Groves #include <linux/mm.h> 13d5406bd4SJohn Groves #include "dax-private.h" 14d5406bd4SJohn Groves #include "bus.h" 15d5406bd4SJohn Groves 16d5406bd4SJohn Groves /* 17d5406bd4SJohn Groves * FS-DAX compatible devdax driver 18d5406bd4SJohn Groves * 19d5406bd4SJohn Groves * Unlike drivers/dax/device.c which pre-initializes compound folios based 20d5406bd4SJohn Groves * on device alignment (via vmemmap_shift), this driver leaves folios 21d5406bd4SJohn Groves * uninitialized similar to pmem. This allows fs-dax filesystems like famfs 22d5406bd4SJohn Groves * to work without needing special handling for pre-initialized folios. 
23d5406bd4SJohn Groves * 24d5406bd4SJohn Groves * Key differences from device.c: 25d5406bd4SJohn Groves * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) 26d5406bd4SJohn Groves * - vmemmap_shift is NOT set (folios remain order-0) 27d5406bd4SJohn Groves * - fs-dax can dynamically create compound folios as needed 28d5406bd4SJohn Groves * - No mmap support - all access is through fs-dax/iomap 29d5406bd4SJohn Groves */ 30d5406bd4SJohn Groves 31099c81a1SJohn Groves static void fsdev_write_dax(void *addr, struct page *page, 32099c81a1SJohn Groves unsigned int off, unsigned int len) 33099c81a1SJohn Groves { 34099c81a1SJohn Groves while (len) { 35099c81a1SJohn Groves void *mem = kmap_local_page(page); 36099c81a1SJohn Groves unsigned int chunk = min_t(unsigned int, len, PAGE_SIZE - off); 37099c81a1SJohn Groves 38099c81a1SJohn Groves memcpy_flushcache(addr, mem + off, chunk); 39099c81a1SJohn Groves kunmap_local(mem); 40099c81a1SJohn Groves len -= chunk; 41099c81a1SJohn Groves off = 0; 42099c81a1SJohn Groves page++; 43099c81a1SJohn Groves addr += chunk; 44099c81a1SJohn Groves } 45099c81a1SJohn Groves } 46099c81a1SJohn Groves 47099c81a1SJohn Groves static long __fsdev_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 48099c81a1SJohn Groves long nr_pages, enum dax_access_mode mode, void **kaddr, 49099c81a1SJohn Groves unsigned long *pfn) 50099c81a1SJohn Groves { 51099c81a1SJohn Groves struct dev_dax *dev_dax = dax_get_private(dax_dev); 52099c81a1SJohn Groves size_t size = nr_pages << PAGE_SHIFT; 53099c81a1SJohn Groves size_t offset = pgoff << PAGE_SHIFT; 54099c81a1SJohn Groves void *virt_addr = dev_dax->virt_addr + offset; 55099c81a1SJohn Groves phys_addr_t phys; 56099c81a1SJohn Groves unsigned long local_pfn; 57099c81a1SJohn Groves 58099c81a1SJohn Groves phys = dax_pgoff_to_phys(dev_dax, pgoff, size); 59099c81a1SJohn Groves if (phys == -1) { 60099c81a1SJohn Groves dev_dbg(&dev_dax->dev, 61099c81a1SJohn Groves "pgoff (%#lx) out of range\n", pgoff); 
62099c81a1SJohn Groves return -EFAULT; 63099c81a1SJohn Groves } 64099c81a1SJohn Groves 65099c81a1SJohn Groves if (kaddr) 66099c81a1SJohn Groves *kaddr = virt_addr; 67099c81a1SJohn Groves 68099c81a1SJohn Groves local_pfn = PHYS_PFN(phys); 69099c81a1SJohn Groves if (pfn) 70099c81a1SJohn Groves *pfn = local_pfn; 71099c81a1SJohn Groves 72099c81a1SJohn Groves /* 73099c81a1SJohn Groves * Use cached_size which was computed at probe time. The size cannot 74099c81a1SJohn Groves * change while the driver is bound (resize returns -EBUSY). 75099c81a1SJohn Groves */ 76099c81a1SJohn Groves return PHYS_PFN(min(size, dev_dax->cached_size - offset)); 77099c81a1SJohn Groves } 78099c81a1SJohn Groves 79099c81a1SJohn Groves static int fsdev_dax_zero_page_range(struct dax_device *dax_dev, 80099c81a1SJohn Groves pgoff_t pgoff, size_t nr_pages) 81099c81a1SJohn Groves { 82099c81a1SJohn Groves void *kaddr; 83099c81a1SJohn Groves 84099c81a1SJohn Groves WARN_ONCE(nr_pages > 1, "%s: nr_pages > 1\n", __func__); 85099c81a1SJohn Groves __fsdev_dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); 86099c81a1SJohn Groves fsdev_write_dax(kaddr, ZERO_PAGE(0), 0, PAGE_SIZE); 87099c81a1SJohn Groves return 0; 88099c81a1SJohn Groves } 89099c81a1SJohn Groves 90099c81a1SJohn Groves static long fsdev_dax_direct_access(struct dax_device *dax_dev, 91099c81a1SJohn Groves pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, 92099c81a1SJohn Groves void **kaddr, unsigned long *pfn) 93099c81a1SJohn Groves { 94099c81a1SJohn Groves return __fsdev_dax_direct_access(dax_dev, pgoff, nr_pages, mode, 95099c81a1SJohn Groves kaddr, pfn); 96099c81a1SJohn Groves } 97099c81a1SJohn Groves 98099c81a1SJohn Groves static size_t fsdev_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, 99099c81a1SJohn Groves void *addr, size_t bytes, struct iov_iter *i) 100099c81a1SJohn Groves { 101099c81a1SJohn Groves return _copy_from_iter_flushcache(addr, bytes, i); 102099c81a1SJohn Groves } 103099c81a1SJohn Groves 
/* dax_operations used by fs-dax to reach device memory. */
static const struct dax_operations dev_dax_ops = {
	.direct_access = fsdev_dax_direct_access,
	.zero_page_range = fsdev_dax_zero_page_range,
	.recovery_write = fsdev_dax_recovery_write,
};

/* devm action: remove the character device added in fsdev_dax_probe(). */
static void fsdev_cdev_del(void *cdev)
{
	cdev_del(cdev);
}

/* devm action: call kill_dev_dax() on the dev_dax passed as @dev_dax. */
static void fsdev_kill(void *dev_dax)
{
	kill_dev_dax(dev_dax);
}

/* devm action: detach dev_dax_ops from the dax device by setting ops to NULL. */
static void fsdev_clear_ops(void *data)
{
	struct dev_dax *dev_dax = data;

	dax_set_ops(dev_dax->dax_dev, NULL);
}

/*
 * Page map operations for FS-DAX mode
 * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c
 *
 * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX.
 * The core mm code in free_zone_device_folio() handles the wake_up_var()
 * directly for this memory type.
 */

/*
 * dev_pagemap_ops.memory_failure callback.
 *
 * Converts the failing pfn range into a byte offset and length and
 * forwards it to dax_holder_notify_failure(), returning its result.
 *
 * NOTE(review): the offset is computed relative to ranges[0] only;
 * confirm this is the intended behaviour for multi-range devices.
 */
static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap,
		unsigned long pfn, unsigned long nr_pages, int mf_flags)
{
	struct dev_dax *dev_dax = pgmap->owner;
	u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start;
	u64 len = nr_pages << PAGE_SHIFT;

	return dax_holder_notify_failure(dev_dax->dax_dev, offset,
					 len, mf_flags);
}

static const struct dev_pagemap_ops fsdev_pagemap_ops = {
	.memory_failure		= fsdev_pagemap_memory_failure,
};

/*
 * Clear any stale folio state from pages in the given range.
 * This is necessary because device_dax pre-initializes compound folios
 * based on vmemmap_shift, and that state may persist after driver unbind.
 * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
 * expects to find clean order-0 folios that it can build into compound
 * folios on demand.
 *
 * At probe time, no filesystem should be mounted yet, so all mappings
 * are stale and must be cleared along with compound state.
 */
static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
{
	for (int i = 0; i < dev_dax->nr_range; i++) {
		struct range *range = &dev_dax->ranges[i].range;
		unsigned long pfn = PHYS_PFN(range->start);
		/* range->end is inclusive, hence the +1 for an exclusive bound */
		unsigned long end_pfn = PHYS_PFN(range->end) + 1;

		while (pfn < end_pfn) {
			struct folio *folio = pfn_folio(pfn);
			int order = dax_folio_reset_order(folio);

			/* advance by the number of pages implied by the returned order */
			pfn += 1UL << order;
		}
	}
}

/* devm action wrapper: @data is the struct dev_dax to clean up. */
static void fsdev_clear_folio_state_action(void *data)
{
	fsdev_clear_folio_state(data);
}

/*
 * open() handler for the character device: stash the dev_dax that owns
 * this inode in filp->private_data. Always returns 0.
 */
static int fsdev_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	struct dev_dax *dev_dax = dax_get_private(dax_dev);

	filp->private_data = dev_dax;

	return 0;
}

/* release() handler: nothing to tear down; always returns 0. */
static int fsdev_release(struct inode *inode, struct file *filp)
{
	return 0;
}

/* File operations for the character device; no read/write/mmap handlers. */
static const struct file_operations fsdev_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = fsdev_open,
	.release = fsdev_release,
};

/*
 * Bind the fsdev driver to @dev_dax.
 *
 * In order, this:
 *  1. selects the static pgmap or allocates a dynamic one mirroring the
 *     device ranges,
 *  2. reserves every range with devm_request_mem_region(),
 *  3. caches the total device size in dev_dax->cached_size,
 *  4. maps the pages as MEMORY_DEVICE_FS_DAX via devm_memremap_pages(),
 *  5. resets folio state now and registers the same reset as a devm action,
 *  6. records the kernel virtual address of the data in dev_dax->virt_addr,
 *  7. adds the character device, installs dev_dax_ops, and runs the device.
 *
 * Returns 0 on success or a negative errno from the failing step.
 */
static int fsdev_dax_probe(struct dev_dax *dev_dax)
{
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct device *dev = &dev_dax->dev;
	struct dev_pagemap *pgmap;
	struct inode *inode;
	u64 data_offset = 0;
	struct cdev *cdev;
	void *addr;
	int rc, i;

	if (static_dev_dax(dev_dax)) {
		if (dev_dax->nr_range > 1) {
			dev_warn(dev, "static pgmap / multi-range device conflict\n");
			return -EINVAL;
		}

		pgmap = dev_dax->pgmap;
	} else {
		size_t pgmap_size;

		if (dev_dax->pgmap) {
			dev_warn(dev, "dynamic-dax with pre-populated page map\n");
			return -EINVAL;
		}

		/*
		 * nr_range - 1 trailing entries are requested.
		 * NOTE(review): presumably struct dev_pagemap already embeds
		 * storage for the first range - confirm against its definition.
		 */
		pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1);
		pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL);
		if (!pgmap)
			return -ENOMEM;

		pgmap->nr_range = dev_dax->nr_range;
		/*
		 * NOTE(review): dev_dax->pgmap is not reset on the error
		 * returns below; confirm a failed probe cannot leave a stale
		 * pointer that trips the "pre-populated" check on re-probe.
		 */
		dev_dax->pgmap = pgmap;

		for (i = 0; i < dev_dax->nr_range; i++) {
			struct range *range = &dev_dax->ranges[i].range;

			pgmap->ranges[i] = *range;
		}
	}

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range *range = &dev_dax->ranges[i].range;

		if (!devm_request_mem_region(dev, range->start,
					range_len(range), dev_name(dev))) {
			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
				 i, range->start, range->end);
			return -EBUSY;
		}
	}

	/* Cache size now; it cannot change while driver is bound */
	dev_dax->cached_size = 0;
	for (i = 0; i < dev_dax->nr_range; i++)
		dev_dax->cached_size += range_len(&dev_dax->ranges[i].range);

	/*
	 * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving
	 * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this
	 * lets fs-dax dynamically build compound folios as needed, similar
	 * to pmem behavior.
	 */
	pgmap->type = MEMORY_DEVICE_FS_DAX;
	pgmap->ops = &fsdev_pagemap_ops;
	pgmap->owner = dev_dax;

	addr = devm_memremap_pages(dev, pgmap);
	if (IS_ERR(addr))
		return PTR_ERR(addr);

	/*
	 * Clear any stale compound folio state left over from a previous
	 * driver (e.g., device_dax with vmemmap_shift). Also register this
	 * as a devm action so folio state is cleared on unbind, ensuring
	 * clean pages for subsequent drivers (e.g., kmem for system-ram).
	 */
	fsdev_clear_folio_state(dev_dax);
	rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action,
				      dev_dax);
	if (rc)
		return rc;

	/* Detect whether the data is at a non-zero offset into the memory */
	if (pgmap->range.start != dev_dax->ranges[0].range.start) {
		u64 phys = dev_dax->ranges[0].range.start;
		u64 pgmap_phys = dev_dax->pgmap[0].range.start;

		/* a pgmap that starts after the data leaves data_offset at 0 */
		if (!WARN_ON(pgmap_phys > phys))
			data_offset = phys - pgmap_phys;

		pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n",
		       __func__, phys, pgmap_phys, data_offset);
	}
	dev_dax->virt_addr = addr + data_offset;

	inode = dax_inode(dax_dev);
	cdev = inode->i_cdev;
	cdev_init(cdev, &fsdev_fops);
	cdev->owner = dev->driver->owner;
	cdev_set_parent(cdev, &dev->kobj);
	rc = cdev_add(cdev, dev->devt, 1);
	if (rc)
		return rc;

	rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
	if (rc)
		return rc;

	/* Set the dax operations for fs-dax access path */
	rc = dax_set_ops(dax_dev, &dev_dax_ops);
	if (rc)
		return rc;

	rc = devm_add_action_or_reset(dev, fsdev_clear_ops, dev_dax);
	if (rc)
		return rc;

	/*
	 * NOTE(review): fsdev_kill is registered last; devm actions are
	 * expected to run in reverse registration order on unbind, which
	 * would make it the first teardown step - confirm.
	 */
	run_dax(dax_dev);
	return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
}

/* Driver descriptor registered with the dax bus for DAXDRV_FSDEV_TYPE. */
static struct dax_device_driver fsdev_dax_driver = {
	.probe = fsdev_dax_probe,
	.type = DAXDRV_FSDEV_TYPE,
};

/* Module init: register the fsdev dax driver. */
static int __init dax_init(void)
{
	return dax_driver_register(&fsdev_dax_driver);
}

/* Module exit: unregister the fsdev dax driver. */
static void __exit dax_exit(void)
{
	dax_driver_unregister(&fsdev_dax_driver);
}

MODULE_AUTHOR("John Groves");
MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
MODULE_LICENSE("GPL");
module_init(dax_init);
module_exit(dax_exit);
MODULE_ALIAS_DAX_DEVICE(0);