// SPDX-License-Identifier: GPL-2.0-only
/*
 * Ram backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/debugfs.h>

#include <linux/uaccess.h>

/*
 * Each block ramdisk device has an xarray, brd_pages, that stores the pages
 * containing the block device's contents.
 */
struct brd_device {
	int			brd_number;
	struct gendisk		*brd_disk;
	struct list_head	brd_list;

	/*
	 * Backing store of pages. This is the contents of the block device.
	 */
	struct xarray		brd_pages;
	u64			brd_nr_pages;
};

/*
 * Look up and return a brd's page for a given sector, with a reference
 * grabbed.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	struct page *page;
	XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);

	rcu_read_lock();
repeat:
	page = xas_load(&xas);
	if (xas_retry(&xas, page)) {
		xas_reset(&xas);
		goto repeat;
	}

	if (!page)
		goto out;

	if (!get_page_unless_zero(page)) {
		xas_reset(&xas);
		goto repeat;
	}

	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		xas_reset(&xas);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return page;
}

/*
 * Insert a new page for a given sector, if one does not already exist.
 * The returned page is grabbed with a reference.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
		blk_opf_t opf)
{
	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
	struct page *page, *ret;

	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
	if (!page)
		return ERR_PTR(-ENOMEM);

	xa_lock(&brd->brd_pages);
	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
			page, gfp);
	if (!ret) {
		brd->brd_nr_pages++;
		get_page(page);
		xa_unlock(&brd->brd_pages);
		return page;
	}

	if (!xa_is_err(ret)) {
		get_page(ret);
		xa_unlock(&brd->brd_pages);
		put_page(page);
		return ret;
	}

	xa_unlock(&brd->brd_pages);
	put_page(page);
	return ERR_PTR(xa_err(ret));
}

/*
 * Free all backing store pages and the xarray. This must only be called when
 * there are no other users of the device.
 */
static void brd_free_pages(struct brd_device *brd)
{
	struct page *page;
	pgoff_t idx;

	xa_for_each(&brd->brd_pages, idx, page) {
		put_page(page);
		cond_resched();
	}

	xa_destroy(&brd->brd_pages);
}

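/*
 * Illustrative arithmetic only (not part of the driver): assuming 4 KiB
 * pages and 512-byte sectors, PAGE_SECTORS_SHIFT is 3 and PAGE_SECTORS is 8,
 * so sector 17 maps to xarray index 17 >> 3 = 2, at byte offset
 * (17 & 7) << 9 = 512 within that page. This is the mapping used by both
 * brd_lookup_page() and brd_insert_page() above.
 */
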
/*
 * Process a single segment. The segment is capped to not cross page boundaries
 * in both the bio and the brd backing memory.
 */
static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
	struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
	sector_t sector = bio->bi_iter.bi_sector;
	u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
	blk_opf_t opf = bio->bi_opf;
	struct page *page;
	void *kaddr;

	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);

	page = brd_lookup_page(brd, sector);
	if (!page && op_is_write(opf)) {
		page = brd_insert_page(brd, sector, opf);
		if (IS_ERR(page))
			goto out_error;
	}

	kaddr = bvec_kmap_local(&bv);
	if (op_is_write(opf)) {
		memcpy_to_page(page, offset, kaddr, bv.bv_len);
	} else {
		if (page)
			memcpy_from_page(kaddr, page, offset, bv.bv_len);
		else
			memset(kaddr, 0, bv.bv_len);
	}
	kunmap_local(kaddr);

	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
	if (page)
		put_page(page);
	return true;

out_error:
	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
		bio_wouldblock_error(bio);
	else
		bio_io_error(bio);
	return false;
}

static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
	sector_t aligned_end = round_down(
			sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
	struct page *page;

	if (aligned_end <= aligned_sector)
		return;

	xa_lock(&brd->brd_pages);
	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
		page = __xa_erase(&brd->brd_pages,
				aligned_sector >> PAGE_SECTORS_SHIFT);
		if (page) {
			put_page(page);
			brd->brd_nr_pages--;
		}
		aligned_sector += PAGE_SECTORS;
	}
	xa_unlock(&brd->brd_pages);
}

static void brd_submit_bio(struct bio *bio)
{
	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;

	if (unlikely(op_is_discard(bio->bi_opf))) {
		brd_do_discard(brd, bio->bi_iter.bi_sector,
				bio->bi_iter.bi_size);
		bio_endio(bio);
		return;
	}

	do {
		if (!brd_rw_bvec(brd, bio))
			return;
	} while (bio->bi_iter.bi_size);

	bio_endio(bio);
}

static const struct block_device_operations brd_fops = {
	.owner =		THIS_MODULE,
	.submit_bio =		brd_submit_bio,
};

/*
 * And now the module's code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");

MODULE_DESCRIPTION("Ram backed block device driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str, NULL, 0);
	return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif

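/*
 * Usage sketch (assumed invocation, not part of the driver): loading the
 * module with "modprobe brd rd_nr=2 rd_size=16384" creates /dev/ram0 and
 * /dev/ram1, each 16 MiB (rd_size is given in KiB, per the parameter
 * description above). On non-modular builds, the legacy boot option
 * "ramdisk_size=16384" sets the same per-disk size via the __setup hook.
 */
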
/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
static struct dentry *brd_debugfs_dir;

static struct brd_device *brd_find_or_alloc_device(int i)
{
	struct brd_device *brd;

	mutex_lock(&brd_devices_mutex);
	list_for_each_entry(brd, &brd_devices, brd_list) {
		if (brd->brd_number == i) {
			mutex_unlock(&brd_devices_mutex);
			return ERR_PTR(-EEXIST);
		}
	}

	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	if (!brd) {
		mutex_unlock(&brd_devices_mutex);
		return ERR_PTR(-ENOMEM);
	}
	brd->brd_number = i;
	list_add_tail(&brd->brd_list, &brd_devices);
	mutex_unlock(&brd_devices_mutex);
	return brd;
}

static void brd_free_device(struct brd_device *brd)
{
	mutex_lock(&brd_devices_mutex);
	list_del(&brd->brd_list);
	mutex_unlock(&brd_devices_mutex);
	kfree(brd);
}

static int brd_alloc(int i)
{
	struct brd_device *brd;
	struct gendisk *disk;
	char buf[DISK_NAME_LEN];
	int err = -ENOMEM;
	struct queue_limits lim = {
		/*
		 * This is so fdisk will align partitions on 4k, because of
		 * direct_access API needing 4k alignment, returning a PFN
		 * (This is only a problem on very small devices <= 4M,
		 *  otherwise fdisk will align on 1M. Regardless this call
		 *  is harmless)
		 */
		.physical_block_size	= PAGE_SIZE,
		.max_hw_discard_sectors	= UINT_MAX,
		.max_discard_segments	= 1,
		.discard_granularity	= PAGE_SIZE,
		.features		= BLK_FEAT_SYNCHRONOUS |
					  BLK_FEAT_NOWAIT,
	};

	brd = brd_find_or_alloc_device(i);
	if (IS_ERR(brd))
		return PTR_ERR(brd);

	xa_init(&brd->brd_pages);

	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
				&brd->brd_nr_pages);

	disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		err = PTR_ERR(disk);
		goto out_free_dev;
	}
	disk->major		= RAMDISK_MAJOR;
	disk->first_minor	= i * max_part;
	disk->minors		= max_part;
	disk->fops		= &brd_fops;
	disk->private_data	= brd;
	strscpy(disk->disk_name, buf, DISK_NAME_LEN);
	set_capacity(disk, rd_size * 2);

	err = add_disk(disk);
	if (err)
		goto out_cleanup_disk;

	return 0;

out_cleanup_disk:
	put_disk(disk);
out_free_dev:
	brd_free_device(brd);
	return err;
}

static void brd_probe(dev_t dev)
{
	brd_alloc(MINOR(dev) / max_part);
}

static void brd_cleanup(void)
{
	struct brd_device *brd, *next;

	debugfs_remove_recursive(brd_debugfs_dir);

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		del_gendisk(brd->brd_disk);
		put_disk(brd->brd_disk);
		brd_free_pages(brd);
		brd_free_device(brd);
	}
}

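/*
 * Worked example for the check below (illustration only): max_part = 3 does
 * not divide 1U << MINORBITS evenly, so it is rounded up to the next power
 * of two, 1UL << fls(3) = 4, keeping every device's minor range unambiguous.
 */
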
static inline void brd_check_and_reset_par(void)
{
	if (unlikely(!max_part))
		max_part = 1;

	/*
	 * Make sure 'max_part' divides (1U << MINORBITS) exactly; otherwise
	 * it is possible to get the same dev_t when adding partitions.
	 */
	if ((1U << MINORBITS) % max_part != 0)
		max_part = 1UL << fls(max_part);

	if (max_part > DISK_MAX_PARTS) {
		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
			DISK_MAX_PARTS, DISK_MAX_PARTS);
		max_part = DISK_MAX_PARTS;
	}
}

static int __init brd_init(void)
{
	int err, i;

	/*
	 * The brd module now has a feature to instantiate the underlying
	 * device structure on-demand, provided that the dev node is accessed.
	 *
	 * (1) If rd_nr is specified, create that many upfront; otherwise it
	 *     defaults to CONFIG_BLK_DEV_RAM_COUNT.
	 * (2) Users can further extend brd devices by creating dev nodes
	 *     themselves and having the kernel automatically instantiate the
	 *     actual device on-demand. Example:
	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
	 *		fdisk -l /path/devnod_name
	 *	If (X / max_part) was not already created it will be created
	 *	dynamically.
	 */

	brd_check_and_reset_par();

	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);

	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
		err = -EIO;
		goto out_free;
	}

	for (i = 0; i < rd_nr; i++)
		brd_alloc(i);

	pr_info("brd: module loaded\n");
	return 0;

out_free:
	brd_cleanup();

	pr_info("brd: module NOT loaded !!!\n");
	return err;
}

static void __exit brd_exit(void)
{
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
	brd_cleanup();

	pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);

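/*
 * End-to-end usage sketch (assumed shell session, not part of the driver):
 *
 *	modprobe brd rd_nr=1 rd_size=4096	# one 4 MiB RAM disk
 *	mkfs.ext4 /dev/ram0
 *	mount /dev/ram0 /mnt
 *
 * Backing pages are allocated lazily on first write (see brd_insert_page()),
 * so an idle device consumes almost no memory; the per-disk debugfs entry
 * under "ramdisk_pages" reports the current page count.
 */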