xref: /linux/drivers/nvdimm/pmem.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * Persistent Memory Driver
3  *
4  * Copyright (c) 2014-2015, Intel Corporation.
5  * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
6  * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  */
17 
18 #include <asm/cacheflush.h>
19 #include <linux/blkdev.h>
20 #include <linux/hdreg.h>
21 #include <linux/init.h>
22 #include <linux/platform_device.h>
23 #include <linux/module.h>
24 #include <linux/moduleparam.h>
25 #include <linux/badblocks.h>
26 #include <linux/memremap.h>
27 #include <linux/vmalloc.h>
28 #include <linux/pfn_t.h>
29 #include <linux/slab.h>
30 #include <linux/pmem.h>
31 #include <linux/nd.h>
32 #include "pfn.h"
33 #include "nd.h"
34 
35 struct pmem_device {
36 	/* One contiguous memory region per device */
37 	phys_addr_t		phys_addr;
38 	/* when non-zero this device is hosting a 'pfn' instance */
39 	phys_addr_t		data_offset;
40 	u64			pfn_flags;
41 	void __pmem		*virt_addr;
42 	/* immutable base size of the namespace */
43 	size_t			size;
44 	/* trim size when namespace capacity has been section aligned */
45 	u32			pfn_pad;
46 	struct badblocks	bb;
47 };
48 
49 static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
50 		unsigned int len)
51 {
52 	struct device *dev = pmem->bb.dev;
53 	sector_t sector;
54 	long cleared;
55 
56 	sector = (offset - pmem->data_offset) / 512;
57 	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
58 
59 	if (cleared > 0 && cleared / 512) {
60 		dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
61 				__func__, (unsigned long long) sector,
62 				cleared / 512, cleared / 512 > 1 ? "s" : "");
63 		badblocks_clear(&pmem->bb, sector, cleared / 512);
64 	}
65 	invalidate_pmem(pmem->virt_addr + offset, len);
66 }
67 
68 static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
69 			unsigned int len, unsigned int off, int rw,
70 			sector_t sector)
71 {
72 	int rc = 0;
73 	bool bad_pmem = false;
74 	void *mem = kmap_atomic(page);
75 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
76 	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
77 
78 	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
79 		bad_pmem = true;
80 
81 	if (rw == READ) {
82 		if (unlikely(bad_pmem))
83 			rc = -EIO;
84 		else {
85 			rc = memcpy_from_pmem(mem + off, pmem_addr, len);
86 			flush_dcache_page(page);
87 		}
88 	} else {
89 		/*
90 		 * Note that we write the data both before and after
91 		 * clearing poison.  The write before clear poison
92 		 * handles situations where the latest written data is
93 		 * preserved and the clear poison operation simply marks
94 		 * the address range as valid without changing the data.
95 		 * In this case application software can assume that an
96 		 * interrupted write will either return the new good
97 		 * data or an error.
98 		 *
99 		 * However, if pmem_clear_poison() leaves the data in an
100 		 * indeterminate state we need to perform the write
101 		 * after clear poison.
102 		 */
103 		flush_dcache_page(page);
104 		memcpy_to_pmem(pmem_addr, mem + off, len);
105 		if (unlikely(bad_pmem)) {
106 			pmem_clear_poison(pmem, pmem_off, len);
107 			memcpy_to_pmem(pmem_addr, mem + off, len);
108 		}
109 	}
110 
111 	kunmap_atomic(mem);
112 	return rc;
113 }
114 
115 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
116 {
117 	int rc = 0;
118 	bool do_acct;
119 	unsigned long start;
120 	struct bio_vec bvec;
121 	struct bvec_iter iter;
122 	struct pmem_device *pmem = q->queuedata;
123 
124 	do_acct = nd_iostat_start(bio, &start);
125 	bio_for_each_segment(bvec, bio, iter) {
126 		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
127 				bvec.bv_offset, bio_data_dir(bio),
128 				iter.bi_sector);
129 		if (rc) {
130 			bio->bi_error = rc;
131 			break;
132 		}
133 	}
134 	if (do_acct)
135 		nd_iostat_end(bio, start);
136 
137 	if (bio_data_dir(bio))
138 		wmb_pmem();
139 
140 	bio_endio(bio);
141 	return BLK_QC_T_NONE;
142 }
143 
144 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
145 		       struct page *page, int rw)
146 {
147 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
148 	int rc;
149 
150 	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
151 	if (rw & WRITE)
152 		wmb_pmem();
153 
154 	/*
155 	 * The ->rw_page interface is subtle and tricky.  The core
156 	 * retries on any error, so we can only invoke page_endio() in
157 	 * the successful completion case.  Otherwise, we'll see crashes
158 	 * caused by double completion.
159 	 */
160 	if (rc == 0)
161 		page_endio(page, rw & WRITE, 0);
162 
163 	return rc;
164 }
165 
166 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
167 		      void __pmem **kaddr, pfn_t *pfn, long size)
168 {
169 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
170 	resource_size_t offset = sector * 512 + pmem->data_offset;
171 
172 	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
173 		return -EIO;
174 	*kaddr = pmem->virt_addr + offset;
175 	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
176 
177 	/*
178 	 * If badblocks are present, limit known good range to the
179 	 * requested range.
180 	 */
181 	if (unlikely(pmem->bb.count))
182 		return size;
183 	return pmem->size - pmem->pfn_pad - offset;
184 }
185 
186 static const struct block_device_operations pmem_fops = {
187 	.owner =		THIS_MODULE,
188 	.rw_page =		pmem_rw_page,
189 	.direct_access =	pmem_direct_access,
190 	.revalidate_disk =	nvdimm_revalidate_disk,
191 };
192 
/* devm action: tear down the request queue passed as @q. */
static void pmem_release_queue(void *q)
{
	struct request_queue *queue = q;

	blk_cleanup_queue(queue);
}
197 
/*
 * devm action: unregister the gendisk passed as @disk and drop the
 * reference on it.
 */
void pmem_release_disk(void *disk)
{
	struct gendisk *gd = disk;

	del_gendisk(gd);
	put_disk(gd);
}
203 
204 static int pmem_attach_disk(struct device *dev,
205 		struct nd_namespace_common *ndns)
206 {
207 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
208 	struct vmem_altmap __altmap, *altmap = NULL;
209 	struct resource *res = &nsio->res;
210 	struct nd_pfn *nd_pfn = NULL;
211 	int nid = dev_to_node(dev);
212 	struct nd_pfn_sb *pfn_sb;
213 	struct pmem_device *pmem;
214 	struct resource pfn_res;
215 	struct request_queue *q;
216 	struct gendisk *disk;
217 	void *addr;
218 
219 	/* while nsio_rw_bytes is active, parse a pfn info block if present */
220 	if (is_nd_pfn(dev)) {
221 		nd_pfn = to_nd_pfn(dev);
222 		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
223 		if (IS_ERR(altmap))
224 			return PTR_ERR(altmap);
225 	}
226 
227 	/* we're attaching a block device, disable raw namespace access */
228 	devm_nsio_disable(dev, nsio);
229 
230 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
231 	if (!pmem)
232 		return -ENOMEM;
233 
234 	dev_set_drvdata(dev, pmem);
235 	pmem->phys_addr = res->start;
236 	pmem->size = resource_size(res);
237 	if (!arch_has_wmb_pmem())
238 		dev_warn(dev, "unable to guarantee persistence of writes\n");
239 
240 	if (!devm_request_mem_region(dev, res->start, resource_size(res),
241 				dev_name(dev))) {
242 		dev_warn(dev, "could not reserve region %pR\n", res);
243 		return -EBUSY;
244 	}
245 
246 	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
247 	if (!q)
248 		return -ENOMEM;
249 
250 	pmem->pfn_flags = PFN_DEV;
251 	if (is_nd_pfn(dev)) {
252 		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
253 				altmap);
254 		pfn_sb = nd_pfn->pfn_sb;
255 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
256 		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
257 		pmem->pfn_flags |= PFN_MAP;
258 		res = &pfn_res; /* for badblocks populate */
259 		res->start += pmem->data_offset;
260 	} else if (pmem_should_map_pages(dev)) {
261 		addr = devm_memremap_pages(dev, &nsio->res,
262 				&q->q_usage_counter, NULL);
263 		pmem->pfn_flags |= PFN_MAP;
264 	} else
265 		addr = devm_memremap(dev, pmem->phys_addr,
266 				pmem->size, ARCH_MEMREMAP_PMEM);
267 
268 	/*
269 	 * At release time the queue must be dead before
270 	 * devm_memremap_pages is unwound
271 	 */
272 	if (devm_add_action(dev, pmem_release_queue, q)) {
273 		blk_cleanup_queue(q);
274 		return -ENOMEM;
275 	}
276 
277 	if (IS_ERR(addr))
278 		return PTR_ERR(addr);
279 	pmem->virt_addr = (void __pmem *) addr;
280 
281 	blk_queue_make_request(q, pmem_make_request);
282 	blk_queue_physical_block_size(q, PAGE_SIZE);
283 	blk_queue_max_hw_sectors(q, UINT_MAX);
284 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
285 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
286 	q->queuedata = pmem;
287 
288 	disk = alloc_disk_node(0, nid);
289 	if (!disk)
290 		return -ENOMEM;
291 	if (devm_add_action(dev, pmem_release_disk, disk)) {
292 		put_disk(disk);
293 		return -ENOMEM;
294 	}
295 
296 	disk->fops		= &pmem_fops;
297 	disk->queue		= q;
298 	disk->flags		= GENHD_FL_EXT_DEVT;
299 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
300 	disk->driverfs_dev = dev;
301 	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
302 			/ 512);
303 	if (devm_init_badblocks(dev, &pmem->bb))
304 		return -ENOMEM;
305 	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
306 	disk->bb = &pmem->bb;
307 	add_disk(disk);
308 	revalidate_disk(disk);
309 
310 	return 0;
311 }
312 
313 static int nd_pmem_probe(struct device *dev)
314 {
315 	struct nd_namespace_common *ndns;
316 
317 	ndns = nvdimm_namespace_common_probe(dev);
318 	if (IS_ERR(ndns))
319 		return PTR_ERR(ndns);
320 
321 	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
322 		return -ENXIO;
323 
324 	if (is_nd_btt(dev))
325 		return nvdimm_namespace_attach_btt(ndns);
326 
327 	if (is_nd_pfn(dev))
328 		return pmem_attach_disk(dev, ndns);
329 
330 	/* if we find a valid info-block we'll come back as that personality */
331 	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
332 			|| nd_dax_probe(dev, ndns) == 0)
333 		return -ENXIO;
334 
335 	/* ...otherwise we're just a raw pmem device */
336 	return pmem_attach_disk(dev, ndns);
337 }
338 
/*
 * Driver remove: only the BTT personality needs explicit teardown
 * here.  Always returns 0.
 */
static int nd_pmem_remove(struct device *dev)
{
	if (!is_nd_btt(dev))
		return 0;

	nvdimm_namespace_detach_btt(to_nd_btt(dev));
	return 0;
}
345 
346 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
347 {
348 	struct nd_region *nd_region = to_nd_region(dev->parent);
349 	struct pmem_device *pmem = dev_get_drvdata(dev);
350 	resource_size_t offset = 0, end_trunc = 0;
351 	struct nd_namespace_common *ndns;
352 	struct nd_namespace_io *nsio;
353 	struct resource res;
354 
355 	if (event != NVDIMM_REVALIDATE_POISON)
356 		return;
357 
358 	if (is_nd_btt(dev)) {
359 		struct nd_btt *nd_btt = to_nd_btt(dev);
360 
361 		ndns = nd_btt->ndns;
362 	} else if (is_nd_pfn(dev)) {
363 		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
364 		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
365 
366 		ndns = nd_pfn->ndns;
367 		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
368 		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
369 	} else
370 		ndns = to_ndns(dev);
371 
372 	nsio = to_nd_namespace_io(&ndns->dev);
373 	res.start = nsio->res.start + offset;
374 	res.end = nsio->res.end - end_trunc;
375 	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
376 }
377 
378 MODULE_ALIAS("pmem");
379 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
380 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
381 static struct nd_device_driver nd_pmem_driver = {
382 	.probe = nd_pmem_probe,
383 	.remove = nd_pmem_remove,
384 	.notify = nd_pmem_notify,
385 	.drv = {
386 		.name = "nd_pmem",
387 	},
388 	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
389 };
390 
391 static int __init pmem_init(void)
392 {
393 	return nd_driver_register(&nd_pmem_driver);
394 }
395 module_init(pmem_init);
396 
397 static void pmem_exit(void)
398 {
399 	driver_unregister(&nd_pmem_driver.drv);
400 }
401 module_exit(pmem_exit);
402 
403 MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
404 MODULE_LICENSE("GPL v2");
405