// SPDX-License-Identifier: GPL-2.0-only
/*
 * Ram backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/debugfs.h>

#include <linux/uaccess.h>

/*
 * rd_size (the size of each RAM disk, in KiB) is defined with the module
 * parameters below, but is needed earlier by brd_do_discard().
 */
extern unsigned long rd_size;

/*
 * Each brd device has an xarray, brd_pages, holding the pages that back
 * the block device's contents.
 */
struct brd_device {
        int brd_number;
        struct gendisk *brd_disk;
        struct list_head brd_list;

        /*
         * Backing store of pages. This is the contents of the block device.
         */
        struct xarray brd_pages;
        u64 brd_nr_pages;
};

/*
 * Look up a brd's page for a given sector and return it with a reference
 * grabbed, or NULL if no page is mapped at that sector.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
        struct page *page;
        XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);

        rcu_read_lock();
repeat:
        page = xas_load(&xas);
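        /*
         * A retry entry means the xarray is being modified under us;
         * restart the walk from the top.
         */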
        if (xas_retry(&xas, page)) {
                xas_reset(&xas);
                goto repeat;
        }

        if (!page)
                goto out;

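        /*
         * Under RCU the page may be freed concurrently (e.g. by a discard):
         * only take a reference if the refcount is still non-zero, then
         * confirm the page is still the one installed in the tree.
         */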
        if (!get_page_unless_zero(page)) {
                xas_reset(&xas);
                goto repeat;
        }

        if (unlikely(page != xas_reload(&xas))) {
                put_page(page);
                xas_reset(&xas);
                goto repeat;
        }
out:
        rcu_read_unlock();

        return page;
}

/*
 * Insert a new page for a given sector, if one does not already exist.
 * The page is returned with its reference count elevated.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
                blk_opf_t opf)
{
        gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
        struct page *page, *ret;

        page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
        if (!page)
                return ERR_PTR(-ENOMEM);

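        /* Publish the new page, unless another writer already installed one. */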
        xa_lock(&brd->brd_pages);
        ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
                        page, gfp);
        if (!ret) {
                brd->brd_nr_pages++;
                get_page(page);
                xa_unlock(&brd->brd_pages);
                return page;
        }

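        /* Lost the race: another writer installed a page first; use theirs. */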
        if (!xa_is_err(ret)) {
                get_page(ret);
                xa_unlock(&brd->brd_pages);
                put_page(page);
                return ret;
        }

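        /* __xa_cmpxchg() itself failed (e.g. -ENOMEM); report the error. */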
        xa_unlock(&brd->brd_pages);
        put_page(page);
        return ERR_PTR(xa_err(ret));
}

/*
 * Free all backing store pages and the xarray. This must only be called when
 * there are no other users of the device.
 */
static void brd_free_pages(struct brd_device *brd)
{
        struct page *page;
        pgoff_t idx;

        xa_for_each(&brd->brd_pages, idx, page) {
                put_page(page);
                cond_resched();
        }

        xa_destroy(&brd->brd_pages);
}

/*
 * Process a single segment. The segment is capped to not cross page boundaries
 * in both the bio and the brd backing memory.
 */
static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
        struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
        sector_t sector = bio->bi_iter.bi_sector;
        u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
        blk_opf_t opf = bio->bi_opf;
        struct page *page;
        void *kaddr;

        bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);

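        /*
         * Writes allocate backing pages on demand; reads of unallocated
         * ranges see a hole and return zeroes instead.
         */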
        page = brd_lookup_page(brd, sector);
        if (!page && op_is_write(opf)) {
                page = brd_insert_page(brd, sector, opf);
                if (IS_ERR(page))
                        goto out_error;
        }

        kaddr = bvec_kmap_local(&bv);
        if (op_is_write(opf)) {
                memcpy_to_page(page, offset, kaddr, bv.bv_len);
        } else {
                if (page)
                        memcpy_from_page(kaddr, page, offset, bv.bv_len);
                else
                        memset(kaddr, 0, bv.bv_len);
        }
        kunmap_local(kaddr);

        bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
        if (page)
                put_page(page);
        return true;

out_error:
        if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
                bio_wouldblock_error(bio);
        else
                bio_io_error(bio);
        return false;
}

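/*
 * Discard a sector range. Only pages fully covered by the range are freed;
 * partially covered pages at either end are left in place, so their data
 * remains readable.
 */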
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
        sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
        sector_t aligned_end = round_down(
                        sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
        struct page *page;

        if (aligned_end <= aligned_sector)
                return;

        xa_lock(&brd->brd_pages);
        while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
                page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
                if (page) {
                        put_page(page);
                        brd->brd_nr_pages--;
                }
                aligned_sector += PAGE_SECTORS;
        }
        xa_unlock(&brd->brd_pages);
}

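/*
 * Handle a bio: discards are processed in one go; reads and writes are
 * split into page-bounded segments by brd_rw_bvec(), which also completes
 * the bio itself on error.
 */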
static void brd_submit_bio(struct bio *bio)
{
        struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;

        if (unlikely(op_is_discard(bio->bi_opf))) {
                brd_do_discard(brd, bio->bi_iter.bi_sector,
                                bio->bi_iter.bi_size);
                bio_endio(bio);
                return;
        }

        do {
                if (!brd_rw_bvec(brd, bio))
                        return;
        } while (bio->bi_iter.bi_size);

        bio_endio(bio);
}

static const struct block_device_operations brd_fops = {
        .owner =        THIS_MODULE,
        .submit_bio =   brd_submit_bio,
};

/*
 * And now the module's code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Number of minors to reserve between devices");
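
/*
 * Example (when built as a module):
 *      modprobe brd rd_nr=2 rd_size=1048576    # two 1 GiB RAM disks
 */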

MODULE_DESCRIPTION("Ram backed block device driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
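/* e.g. booting with "ramdisk_size=65536" gives 64 MiB RAM disks */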
static int __init ramdisk_size(char *str)
{
        return kstrtoul(str, 0, &rd_size) == 0;
}
__setup("ramdisk_size=", ramdisk_size);
#endif

/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
static struct dentry *brd_debugfs_dir;

static struct brd_device *brd_find_or_alloc_device(int i)
{
        struct brd_device *brd;

        mutex_lock(&brd_devices_mutex);
        list_for_each_entry(brd, &brd_devices, brd_list) {
                if (brd->brd_number == i) {
                        mutex_unlock(&brd_devices_mutex);
                        return ERR_PTR(-EEXIST);
                }
        }

        brd = kzalloc(sizeof(*brd), GFP_KERNEL);
        if (!brd) {
                mutex_unlock(&brd_devices_mutex);
                return ERR_PTR(-ENOMEM);
        }
        brd->brd_number = i;
        list_add_tail(&brd->brd_list, &brd_devices);
        mutex_unlock(&brd_devices_mutex);
        return brd;
}

static void brd_free_device(struct brd_device *brd)
{
        mutex_lock(&brd_devices_mutex);
        list_del(&brd->brd_list);
        mutex_unlock(&brd_devices_mutex);
        kfree(brd);
}

static int brd_alloc(int i)
{
        struct brd_device *brd;
        struct gendisk *disk;
        char buf[DISK_NAME_LEN];
        int err = -ENOMEM;
        struct queue_limits lim = {
                /*
                 * Use a 4k physical block size so that fdisk aligns
                 * partitions on 4k: the direct_access API returns a PFN
                 * and therefore needs 4k alignment. (This only matters on
                 * very small devices <= 4M, where fdisk would otherwise
                 * align on 1M; the setting is harmless regardless.)
                 */
                .physical_block_size = PAGE_SIZE,
                .max_hw_discard_sectors = UINT_MAX,
                .max_discard_segments = 1,
                .discard_granularity = PAGE_SIZE,
                .features = BLK_FEAT_SYNCHRONOUS |
                            BLK_FEAT_NOWAIT,
        };

        brd = brd_find_or_alloc_device(i);
        if (IS_ERR(brd))
                return PTR_ERR(brd);

        xa_init(&brd->brd_pages);

        snprintf(buf, DISK_NAME_LEN, "ram%d", i);
        if (!IS_ERR_OR_NULL(brd_debugfs_dir))
                debugfs_create_u64(buf, 0444, brd_debugfs_dir,
                                &brd->brd_nr_pages);

        disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
        if (IS_ERR(disk)) {
                err = PTR_ERR(disk);
                goto out_free_dev;
        }
        disk->major = RAMDISK_MAJOR;
        disk->first_minor = i * max_part;
        disk->minors = max_part;
        disk->fops = &brd_fops;
        disk->private_data = brd;
        strscpy(disk->disk_name, buf, DISK_NAME_LEN);
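        /* rd_size is in KiB; set_capacity() takes 512-byte sectors. */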
        set_capacity(disk, rd_size * 2);

        err = add_disk(disk);
        if (err)
                goto out_cleanup_disk;

        return 0;

out_cleanup_disk:
        put_disk(disk);
out_free_dev:
        brd_free_device(brd);
        return err;
}

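/*
 * Called when a device node with RAMDISK_MAJOR is opened but no disk exists
 * for that minor yet; instantiates the device on demand (see brd_init()).
 */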
static void brd_probe(dev_t dev)
{
        brd_alloc(MINOR(dev) / max_part);
}

static void brd_cleanup(void)
{
        struct brd_device *brd, *next;

        debugfs_remove_recursive(brd_debugfs_dir);

        list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
                del_gendisk(brd->brd_disk);
                put_disk(brd->brd_disk);
                brd_free_pages(brd);
                brd_free_device(brd);
        }
}

static inline void brd_check_and_reset_par(void)
{
        if (unlikely(!max_part))
                max_part = 1;

        /*
         * Make sure 'max_part' divides (1U << MINORBITS) exactly; otherwise
         * it is possible to get the same dev_t twice when adding partitions.
         */
        if ((1U << MINORBITS) % max_part != 0)
                max_part = 1UL << fls(max_part);

        if (max_part > DISK_MAX_PARTS) {
                pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
                        DISK_MAX_PARTS, DISK_MAX_PARTS);
                max_part = DISK_MAX_PARTS;
        }
}

static int __init brd_init(void)
{
        int err, i;

        /*
         * The brd module can instantiate the underlying device structure
         * on demand, provided a device node for it is accessed:
         *
         * (1) if rd_nr is specified, create that many devices upfront;
         *     otherwise it defaults to CONFIG_BLK_DEV_RAM_COUNT.
         * (2) the user can further extend brd devices by creating device
         *     nodes themselves and having the kernel automatically
         *     instantiate the actual device on demand. Example:
         *              mknod /path/devnod_name b 1 X   # 1 is the rd major
         *              fdisk -l /path/devnod_name
         *     If device (X / max_part) was not already created, it will
         *     be created dynamically.
         */

        brd_check_and_reset_par();

        brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);

        if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
                err = -EIO;
                goto out_free;
        }

        for (i = 0; i < rd_nr; i++)
                brd_alloc(i);

        pr_info("brd: module loaded\n");
        return 0;

out_free:
        brd_cleanup();

        pr_info("brd: module NOT loaded !!!\n");
        return err;
}

static void __exit brd_exit(void)
{
        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
        brd_cleanup();

        pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);