/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/dax.h>
#include <linux/nd.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"

static struct device *to_dev(struct pmem_device *pmem)
{
        /*
         * nvdimm bus services need a 'dev' parameter, and we record the device
         * at init in bb.dev.
         */
        return pmem->bb.dev;
}

static struct nd_region *to_region(struct pmem_device *pmem)
{
        return to_nd_region(to_dev(pmem)->parent);
}

static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
                unsigned int len)
{
        struct device *dev = to_dev(pmem);
        sector_t sector;
        long cleared;
        int rc = 0;

        sector = (offset - pmem->data_offset) / 512;

        cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
        if (cleared < len)
                rc = -EIO;
        if (cleared > 0 && cleared / 512) {
                cleared /= 512;
                dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
                                (unsigned long long) sector, cleared,
                                cleared > 1 ? "s" : "");
                badblocks_clear(&pmem->bb, sector, cleared);
        }

        invalidate_pmem(pmem->virt_addr + offset, len);

        return rc;
}

static void write_pmem(void *pmem_addr, struct page *page,
                unsigned int off, unsigned int len)
{
        void *mem = kmap_atomic(page);

        memcpy_to_pmem(pmem_addr, mem + off, len);
        kunmap_atomic(mem);
}

static int read_pmem(struct page *page, unsigned int off,
                void *pmem_addr, unsigned int len)
{
        int rc;
        void *mem = kmap_atomic(page);

        rc = memcpy_mcsafe(mem + off, pmem_addr, len);
        kunmap_atomic(mem);
        if (rc)
                return -EIO;
        return 0;
}
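
/*
 * Handle a single bio_vec segment: copy 'len' bytes at 'off' within 'page'
 * to or from the pmem mapping for 'sector'. Reads that overlap a known bad
 * range fail with -EIO without touching the media; writes always hit the
 * media and, if the range was flagged bad, attempt to clear the poison and
 * then rewrite the data in case the clear operation disturbed it.
 */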
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
                unsigned int len, unsigned int off, bool is_write,
                sector_t sector)
{
        int rc = 0;
        bool bad_pmem = false;
        phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
        void *pmem_addr = pmem->virt_addr + pmem_off;

        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                bad_pmem = true;

        if (!is_write) {
                if (unlikely(bad_pmem))
                        rc = -EIO;
                else {
                        rc = read_pmem(page, off, pmem_addr, len);
                        flush_dcache_page(page);
                }
        } else {
                /*
                 * Note that we write the data both before and after
                 * clearing poison. The write before clear poison
                 * handles situations where the latest written data is
                 * preserved and the clear poison operation simply marks
                 * the address range as valid without changing the data.
                 * In this case application software can assume that an
                 * interrupted write will either return the new good
                 * data or an error.
                 *
                 * However, if pmem_clear_poison() leaves the data in an
                 * indeterminate state we need to perform the write
                 * after clear poison.
                 */
                flush_dcache_page(page);
                write_pmem(pmem_addr, page, off, len);
                if (unlikely(bad_pmem)) {
                        rc = pmem_clear_poison(pmem, pmem_off, len);
                        write_pmem(pmem_addr, page, off, len);
                }
        }

        return rc;
}

/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
        int rc = 0;
        bool do_acct;
        unsigned long start;
        struct bio_vec bvec;
        struct bvec_iter iter;
        struct pmem_device *pmem = q->queuedata;
        struct nd_region *nd_region = to_region(pmem);

        if (bio->bi_opf & REQ_FLUSH)
                nvdimm_flush(nd_region);

        do_acct = nd_iostat_start(bio, &start);
        bio_for_each_segment(bvec, bio, iter) {
                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
                                bvec.bv_offset, op_is_write(bio_op(bio)),
                                iter.bi_sector);
                if (rc) {
                        bio->bi_error = rc;
                        break;
                }
        }
        if (do_acct)
                nd_iostat_end(bio, start);

        if (bio->bi_opf & REQ_FUA)
                nvdimm_flush(nd_region);

        bio_endio(bio);
        return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
                struct page *page, bool is_write)
{
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        int rc;

        rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);

        /*
         * The ->rw_page interface is subtle and tricky. The core
         * retries on any error, so we can only invoke page_endio() in
         * the successful completion case. Otherwise, we'll see crashes
         * caused by double completion.
         */
        if (rc == 0)
                page_endio(page, is_write, 0);

        return rc;
}

/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn)
{
        resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;

        if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
                                        PFN_PHYS(nr_pages))))
                return -EIO;
        *kaddr = pmem->virt_addr + offset;
        *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

        /*
         * If badblocks are present, limit known good range to the
         * requested range.
         */
        if (unlikely(pmem->bb.count))
                return nr_pages;
        return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);
}

static const struct block_device_operations pmem_fops = {
        .owner = THIS_MODULE,
        .rw_page = pmem_rw_page,
        .revalidate_disk = nvdimm_revalidate_disk,
};

static long pmem_dax_direct_access(struct dax_device *dax_dev,
                pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
        struct pmem_device *pmem = dax_get_private(dax_dev);

        return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
}

static const struct dax_operations pmem_dax_ops = {
        .direct_access = pmem_dax_direct_access,
};

static void pmem_release_queue(void *q)
{
        blk_cleanup_queue(q);
}

static void pmem_freeze_queue(void *q)
{
        blk_freeze_queue_start(q);
}

static void pmem_release_disk(void *__pmem)
{
        struct pmem_device *pmem = __pmem;

        kill_dax(pmem->dax_dev);
        put_dax(pmem->dax_dev);
        del_gendisk(pmem->disk);
        put_disk(pmem->disk);
}
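
/*
 * Set up the block and dax interfaces for a pmem namespace: reserve and map
 * the namespace memory (optionally via a pfn info block so that struct page
 * metadata can live on the device itself), allocate a request queue and
 * gendisk, populate the badblocks list from the region, and register a
 * dax_device that shares the same mapping. Teardown is driven by devm
 * actions so that the queue is frozen before devm_memremap_pages() is
 * unwound.
 */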
static int pmem_attach_disk(struct device *dev,
                struct nd_namespace_common *ndns)
{
        struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
        struct nd_region *nd_region = to_nd_region(dev->parent);
        struct vmem_altmap __altmap, *altmap = NULL;
        struct resource *res = &nsio->res;
        struct nd_pfn *nd_pfn = NULL;
        struct dax_device *dax_dev;
        int nid = dev_to_node(dev);
        struct nd_pfn_sb *pfn_sb;
        struct pmem_device *pmem;
        struct resource pfn_res;
        struct request_queue *q;
        struct gendisk *disk;
        void *addr;

        /* while nsio_rw_bytes is active, parse a pfn info block if present */
        if (is_nd_pfn(dev)) {
                nd_pfn = to_nd_pfn(dev);
                altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
                if (IS_ERR(altmap))
                        return PTR_ERR(altmap);
        }

        /* we're attaching a block device, disable raw namespace access */
        devm_nsio_disable(dev, nsio);

        pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
        if (!pmem)
                return -ENOMEM;

        dev_set_drvdata(dev, pmem);
        pmem->phys_addr = res->start;
        pmem->size = resource_size(res);
        if (nvdimm_has_flush(nd_region) < 0)
                dev_warn(dev, "unable to guarantee persistence of writes\n");

        if (!devm_request_mem_region(dev, res->start, resource_size(res),
                                dev_name(&ndns->dev))) {
                dev_warn(dev, "could not reserve region %pR\n", res);
                return -EBUSY;
        }

        q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
        if (!q)
                return -ENOMEM;

        if (devm_add_action_or_reset(dev, pmem_release_queue, q))
                return -ENOMEM;

        pmem->pfn_flags = PFN_DEV;
        if (is_nd_pfn(dev)) {
                addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
                                altmap);
                pfn_sb = nd_pfn->pfn_sb;
                pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
                pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
                pmem->pfn_flags |= PFN_MAP;
                res = &pfn_res; /* for badblocks populate */
                res->start += pmem->data_offset;
        } else if (pmem_should_map_pages(dev)) {
                addr = devm_memremap_pages(dev, &nsio->res,
                                &q->q_usage_counter, NULL);
                pmem->pfn_flags |= PFN_MAP;
        } else
                addr = devm_memremap(dev, pmem->phys_addr,
                                pmem->size, ARCH_MEMREMAP_PMEM);

        /*
         * At release time the queue must be frozen before
         * devm_memremap_pages is unwound
         */
        if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
                return -ENOMEM;

        if (IS_ERR(addr))
                return PTR_ERR(addr);
        pmem->virt_addr = addr;

        blk_queue_write_cache(q, true, true);
        blk_queue_make_request(q, pmem_make_request);
        blk_queue_physical_block_size(q, PAGE_SIZE);
        blk_queue_max_hw_sectors(q, UINT_MAX);
        blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
        queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
        q->queuedata = pmem;

        disk = alloc_disk_node(0, nid);
        if (!disk)
                return -ENOMEM;
        pmem->disk = disk;

        disk->fops = &pmem_fops;
        disk->queue = q;
        disk->flags = GENHD_FL_EXT_DEVT;
        nvdimm_namespace_disk_name(ndns, disk->disk_name);
        set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
                        / 512);
        if (devm_init_badblocks(dev, &pmem->bb))
                return -ENOMEM;
        nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
        disk->bb = &pmem->bb;

        dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
        if (!dax_dev) {
                put_disk(disk);
                return -ENOMEM;
        }
        pmem->dax_dev = dax_dev;

        device_add_disk(dev, disk);
        if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
                return -ENOMEM;

        revalidate_disk(disk);

        return 0;
}
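
/*
 * Namespace probe: a BTT, PFN, or DAX info block, if present, determines
 * which personality claims the namespace. When nd_btt_probe(),
 * nd_pfn_probe(), or nd_dax_probe() finds a valid info block this probe
 * returns -ENXIO and the namespace comes back as that personality;
 * otherwise it attaches here as a raw pmem block device.
 */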
static int nd_pmem_probe(struct device *dev)
{
        struct nd_namespace_common *ndns;

        ndns = nvdimm_namespace_common_probe(dev);
        if (IS_ERR(ndns))
                return PTR_ERR(ndns);

        if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
                return -ENXIO;

        if (is_nd_btt(dev))
                return nvdimm_namespace_attach_btt(ndns);

        if (is_nd_pfn(dev))
                return pmem_attach_disk(dev, ndns);

        /* if we find a valid info-block we'll come back as that personality */
        if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
                        || nd_dax_probe(dev, ndns) == 0)
                return -ENXIO;

        /* ...otherwise we're just a raw pmem device */
        return pmem_attach_disk(dev, ndns);
}

static int nd_pmem_remove(struct device *dev)
{
        if (is_nd_btt(dev))
                nvdimm_namespace_detach_btt(to_nd_btt(dev));
        nvdimm_flush(to_nd_region(dev->parent));

        return 0;
}

static void nd_pmem_shutdown(struct device *dev)
{
        nvdimm_flush(to_nd_region(dev->parent));
}

static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
        struct nd_region *nd_region;
        resource_size_t offset = 0, end_trunc = 0;
        struct nd_namespace_common *ndns;
        struct nd_namespace_io *nsio;
        struct resource res;
        struct badblocks *bb;

        if (event != NVDIMM_REVALIDATE_POISON)
                return;

        if (is_nd_btt(dev)) {
                struct nd_btt *nd_btt = to_nd_btt(dev);

                ndns = nd_btt->ndns;
                nd_region = to_nd_region(ndns->dev.parent);
                nsio = to_nd_namespace_io(&ndns->dev);
                bb = &nsio->bb;
        } else {
                struct pmem_device *pmem = dev_get_drvdata(dev);

                nd_region = to_region(pmem);
                bb = &pmem->bb;

                if (is_nd_pfn(dev)) {
                        struct nd_pfn *nd_pfn = to_nd_pfn(dev);
                        struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

                        ndns = nd_pfn->ndns;
                        offset = pmem->data_offset +
                                        __le32_to_cpu(pfn_sb->start_pad);
                        end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
                } else {
                        ndns = to_ndns(dev);
                }

                nsio = to_nd_namespace_io(&ndns->dev);
        }

        res.start = nsio->res.start + offset;
        res.end = nsio->res.end - end_trunc;
        nvdimm_badblocks_populate(nd_region, bb, &res);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
        .probe = nd_pmem_probe,
        .remove = nd_pmem_remove,
        .notify = nd_pmem_notify,
        .shutdown = nd_pmem_shutdown,
        .drv = {
                .name = "nd_pmem",
        },
        .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
        return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void pmem_exit(void)
{
        driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");