/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pfn.h"
#include "nd.h"

struct pmem_device {
	/* One contiguous memory region per device */
	phys_addr_t		phys_addr;
	/* when non-zero this device is hosting a 'pfn' instance */
	phys_addr_t		data_offset;
	u64			pfn_flags;
	void __pmem		*virt_addr;
	/* immutable base size of the namespace */
	size_t			size;
	/* trim size when namespace capacity has been section aligned */
	u32			pfn_pad;
	struct badblocks	bb;
};

static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = pmem->bb.dev;
	sector_t sector;
	long cleared;

	sector = (offset - pmem->data_offset) / 512;
	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);

	if (cleared > 0 && cleared / 512) {
		dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
				__func__, (unsigned long long) sector,
				cleared / 512, cleared / 512 > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared / 512);
	}
	invalidate_pmem(pmem->virt_addr + offset, len);
}

static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, int rw,
			sector_t sector)
{
	int rc = 0;
	bool bad_pmem = false;
	void *mem = kmap_atomic(page);
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
		bad_pmem = true;

	if (rw == READ) {
		if (unlikely(bad_pmem))
			rc = -EIO;
		else {
			rc = memcpy_from_pmem(mem + off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		/*
		 * Note that we write the data both before and after
		 * clearing poison. The write before clear poison
		 * handles situations where the latest written data is
		 * preserved and the clear poison operation simply marks
		 * the address range as valid without changing the data.
		 * In this case application software can assume that an
		 * interrupted write will either return the new good
		 * data or an error.
		 *
		 * However, if pmem_clear_poison() leaves the data in an
		 * indeterminate state we need to perform the write
		 * after clear poison.
		 */
		flush_dcache_page(page);
		memcpy_to_pmem(pmem_addr, mem + off, len);
		if (unlikely(bad_pmem)) {
			pmem_clear_poison(pmem, pmem_off, len);
			memcpy_to_pmem(pmem_addr, mem + off, len);
		}
	}

	kunmap_atomic(mem);
	return rc;
}

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	int rc = 0;
	bool do_acct;
	unsigned long start;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct pmem_device *pmem = q->queuedata;

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, bio_data_dir(bio),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio_data_dir(bio))
		wmb_pmem();

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
	if (rw & WRITE)
		wmb_pmem();

	/*
	 * The ->rw_page interface is subtle and tricky. The core
	 * retries on any error, so we can only invoke page_endio() in
	 * the successful completion case. Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc == 0)
		page_endio(page, rw & WRITE, 0);

	return rc;
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
			void __pmem **kaddr, pfn_t *pfn, long size)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	resource_size_t offset = sector * 512 + pmem->data_offset;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
		return -EIO;
	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/*
	 * If badblocks are present, limit known good range to the
	 * requested range.
	 */
	if (unlikely(pmem->bb.count))
		return size;
	return pmem->size - pmem->pfn_pad - offset;
}

static const struct block_device_operations pmem_fops = {
	.owner = THIS_MODULE,
	.rw_page = pmem_rw_page,
	.direct_access = pmem_direct_access,
	.revalidate_disk = nvdimm_revalidate_disk,
};

static void pmem_release_queue(void *q)
{
	blk_cleanup_queue(q);
}

static void pmem_release_disk(void *disk)
{
	del_gendisk(disk);
	put_disk(disk);
}

static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns)
{
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct vmem_altmap __altmap, *altmap = NULL;
	struct resource *res = &nsio->res;
	struct nd_pfn *nd_pfn = NULL;
	int nid = dev_to_node(dev);
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	struct resource pfn_res;
	struct request_queue *q;
	struct gendisk *disk;
	void *addr;

	/* while nsio_rw_bytes is active, parse a pfn info block if present */
	if (is_nd_pfn(dev)) {
		nd_pfn = to_nd_pfn(dev);
		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
		if (IS_ERR(altmap))
			return PTR_ERR(altmap);
	}

	/* we're attaching a block device, disable raw namespace access */
	devm_nsio_disable(dev, nsio);

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return -ENOMEM;

	dev_set_drvdata(dev, pmem);
	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
	if (!arch_has_wmb_pmem())
		dev_warn(dev, "unable to guarantee persistence of writes\n");

	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				dev_name(dev))) {
		dev_warn(dev, "could not reserve region %pR\n", res);
		return -EBUSY;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return -ENOMEM;

	pmem->pfn_flags = PFN_DEV;
	if (is_nd_pfn(dev)) {
		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
				altmap);
		pfn_sb = nd_pfn->pfn_sb;
		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
		pmem->pfn_flags |= PFN_MAP;
		res = &pfn_res; /* for badblocks populate */
		res->start += pmem->data_offset;
	} else if (pmem_should_map_pages(dev)) {
		addr = devm_memremap_pages(dev, &nsio->res,
				&q->q_usage_counter, NULL);
		pmem->pfn_flags |= PFN_MAP;
	} else
		addr = devm_memremap(dev, pmem->phys_addr,
				pmem->size, ARCH_MEMREMAP_PMEM);

	/*
	 * At release time the queue must be dead before
	 * devm_memremap_pages is unwound
	 */
	if (devm_add_action(dev, pmem_release_queue, q)) {
		blk_cleanup_queue(q);
		return -ENOMEM;
	}

	if (IS_ERR(addr))
		return PTR_ERR(addr);
	pmem->virt_addr = (void __pmem *) addr;

	blk_queue_make_request(q, pmem_make_request);
	blk_queue_physical_block_size(q, PAGE_SIZE);
	blk_queue_max_hw_sectors(q, UINT_MAX);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	q->queuedata = pmem;

	disk = alloc_disk_node(0, nid);
	if (!disk)
		return -ENOMEM;
	if (devm_add_action(dev, pmem_release_disk, disk)) {
		put_disk(disk);
		return -ENOMEM;
	}

	disk->fops = &pmem_fops;
	disk->queue = q;
	disk->flags = GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	disk->driverfs_dev = dev;
	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ 512);
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
	disk->bb = &pmem->bb;
	add_disk(disk);
	revalidate_disk(disk);

	return 0;
}

static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (is_nd_pfn(dev))
		return pmem_attach_disk(dev, ndns);

	/* if we find a valid info-block we'll come back as that personality */
	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
			|| nd_dax_probe(dev, ndns) == 0)
		return -ENXIO;

	/* ...otherwise we're just a raw pmem device */
	return pmem_attach_disk(dev, ndns);
}

static int nd_pmem_remove(struct device *dev)
{
	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(to_nd_btt(dev));
	return 0;
}

static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct pmem_device *pmem = dev_get_drvdata(dev);
	resource_size_t offset = 0, end_trunc = 0;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		ndns = nd_btt->ndns;
	} else if (is_nd_pfn(dev)) {
		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

		ndns = nd_pfn->ndns;
		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
	} else
		ndns = to_ndns(dev);

	nsio = to_nd_namespace_io(&ndns->dev);
	res.start = nsio->res.start + offset;
	res.end = nsio->res.end - end_trunc;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.notify = nd_pmem_notify,
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
	return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void __exit pmem_exit(void)
{
	driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");