// SPDX-License-Identifier: GPL-2.0
/*
 *  gendisk handling
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
 */

#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include <linux/part_stat.h>
#include <linux/blktrace_api.h>

#include "blk-throttle.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-cgroup.h"

static struct kobject *block_depr;

/*
 * Unique, monotonically increasing sequential number associated with block
 * device instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace; any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * thus cannot reliably tell whether an event relates to the device it just set
 * up or to an earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate an uevent with the lifetime of a device.
 */
static atomic64_t diskseq;
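
/*
 * For illustration only (not part of the kernel): userspace can associate a
 * received uevent with the device instance it set up by comparing the
 * DISKSEQ uevent variable against the diskseq sysfs attribute. A minimal
 * sketch with a hypothetical helper, assuming <stdio.h>, <stdlib.h> and
 * <stdbool.h>:
 *
 *	static bool uevent_matches_disk(const char *uevent_diskseq)
 *	{
 *		unsigned long long seq = 0;
 *		FILE *f = fopen("/sys/block/loop0/diskseq", "r");
 *
 *		if (!f)
 *			return false;
 *		if (fscanf(f, "%llu", &seq) != 1)
 *			seq = 0;
 *		fclose(f);
 *		// Equal sequence numbers mean the uevent belongs to this
 *		// very instance, not to an earlier "loop0".
 *		return seq && seq == strtoull(uevent_diskseq, NULL, 10);
 *	}
 */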

/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT		(1 << MINORBITS)
static DEFINE_IDA(ext_devt_ida);

void set_capacity(struct gendisk *disk, sector_t sectors)
{
	if (sectors > BLK_DEV_MAX_SECTORS) {
		pr_warn_once("%s: truncate capacity from %lld to %lld\n",
				disk->disk_name, sectors,
				BLK_DEV_MAX_SECTORS);
		sectors = BLK_DEV_MAX_SECTORS;
	}

	bdev_set_nr_sectors(disk->part0, sectors);
}
EXPORT_SYMBOL(set_capacity);

/*
 * Set disk capacity and notify if the size is not currently zero and will not
 * be set to zero.  Returns true if a uevent was sent, otherwise false.
 */
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
	sector_t capacity = get_capacity(disk);
	char *envp[] = { "RESIZE=1", NULL };

	set_capacity(disk, size);

	/*
	 * Only print a message and send a uevent if the gendisk is user visible
	 * and alive.  This avoids spamming the log and udev when setting the
	 * initial capacity during probing.
	 */
	if (size == capacity ||
	    !disk_live(disk) ||
	    (disk->flags & GENHD_FL_HIDDEN))
		return false;

	pr_info("%s: detected capacity change from %lld to %lld\n",
		disk->disk_name, capacity, size);

	/*
	 * Historically we did not send a uevent for changes to/from an empty
	 * device.
	 */
	if (!capacity || !size)
		return false;
	kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
	return true;
}
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
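
/*
 * Example (illustrative sketch, not part of this file): a driver that learns
 * about a new size from its backend would typically call this from its
 * resize handler; "vblk" and its members are hypothetical.
 *
 *	static void vblk_resize(struct vblk *vblk, sector_t new_sectors)
 *	{
 *		// Sends a RESIZE=1 uevent unless the disk is hidden, not yet
 *		// live, or the capacity did not actually change.
 *		set_capacity_and_notify(vblk->disk, new_sectors);
 *	}
 */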

static void part_stat_read_all(struct block_device *part,
		struct disk_stats *stat)
{
	int cpu;

	memset(stat, 0, sizeof(struct disk_stats));
	for_each_possible_cpu(cpu) {
		struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
		int group;

		for (group = 0; group < NR_STAT_GROUPS; group++) {
			stat->nsecs[group] += ptr->nsecs[group];
			stat->sectors[group] += ptr->sectors[group];
			stat->ios[group] += ptr->ios[group];
			stat->merges[group] += ptr->merges[group];
		}

		stat->io_ticks += ptr->io_ticks;
	}
}

static void bdev_count_inflight_rw(struct block_device *part,
		unsigned int inflight[2], bool mq_driver)
{
	int cpu;

	if (mq_driver) {
		blk_mq_in_driver_rw(part, inflight);
	} else {
		for_each_possible_cpu(cpu) {
			inflight[READ] += part_stat_local_read_cpu(
						part, in_flight[READ], cpu);
			inflight[WRITE] += part_stat_local_read_cpu(
						part, in_flight[WRITE], cpu);
		}
	}

	if (WARN_ON_ONCE((int)inflight[READ] < 0))
		inflight[READ] = 0;
	if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
		inflight[WRITE] = 0;
}

/**
 * bdev_count_inflight - get the number of inflight IOs for a block device.
 *
 * @part: the block device.
 *
 * Inflight here means IO for which accounting has started: from
 * bdev_start_io_acct() for bio-based block devices, and from
 * blk_account_io_start() for rq-based block devices.
 */
unsigned int bdev_count_inflight(struct block_device *part)
{
	unsigned int inflight[2] = {0};

	bdev_count_inflight_rw(part, inflight, false);

	return inflight[READ] + inflight[WRITE];
}
EXPORT_SYMBOL_GPL(bdev_count_inflight);
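
/*
 * Example (sketch): a driver could use this to check whether its device has
 * gone idle, e.g. before entering a low-power state; the helper name is
 * hypothetical.
 *
 *	static bool vblk_is_idle(struct gendisk *disk)
 *	{
 *		// disk->part0 is the block_device for the whole disk
 *		return bdev_count_inflight(disk->part0) == 0;
 *	}
 */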

/*
 * Can be deleted altogether. Later.
 *
 */
#define BLKDEV_MAJOR_HASH_SIZE 255
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
	void (*probe)(dev_t devt);
#endif
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
static DEFINE_MUTEX(major_names_lock);
static DEFINE_SPINLOCK(major_names_spinlock);

/* index in the above - for now: assume no multimajor ranges */
static inline int major_to_index(unsigned major)
{
	return major % BLKDEV_MAJOR_HASH_SIZE;
}

#ifdef CONFIG_PROC_FS
void blkdev_show(struct seq_file *seqf, off_t offset)
{
	struct blk_major_name *dp;

	spin_lock(&major_names_spinlock);
	for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
		if (dp->major == offset)
			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
	spin_unlock(&major_names_spinlock);
}
#endif /* CONFIG_PROC_FS */

/**
 * __register_blkdev - register a new block device
 *
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
 * @name: the name of the new block device as a zero terminated string
 * @probe: pre-devtmpfs / pre-udev callback used to create disks when their
 *	   pre-created device node is accessed. When a probe call uses
 *	   add_disk() and it fails the driver must clean up resources. This
 *	   interface may soon be removed.
 *
 * The @name must be unique within the system.
 *
 * The return value depends on the @major input parameter:
 *
 *  - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
 *    then the function returns zero on success, or a negative error code
 *  - if any unused major number was requested with @major = 0 parameter
 *    then the return value is the allocated major number in range
 *    [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
 *
 * Use register_blkdev instead for any new code.
 */
int __register_blkdev(unsigned int major, const char *name,
		void (*probe)(dev_t devt))
{
	struct blk_major_name **n, *p;
	int index, ret = 0;

	mutex_lock(&major_names_lock);

	/* temporary */
	if (major == 0) {
		for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
			if (major_names[index] == NULL)
				break;
		}

		if (index == 0) {
			printk("%s: failed to get major for %s\n",
			       __func__, name);
			ret = -EBUSY;
			goto out;
		}
		major = index;
		ret = major;
	}

	if (major >= BLKDEV_MAJOR_MAX) {
		pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
		       __func__, major, BLKDEV_MAJOR_MAX-1, name);

		ret = -EINVAL;
		goto out;
	}

	p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
	if (p == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	p->major = major;
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
	p->probe = probe;
#endif
	strscpy(p->name, name, sizeof(p->name));
	p->next = NULL;
	index = major_to_index(major);

	spin_lock(&major_names_spinlock);
	for (n = &major_names[index]; *n; n = &(*n)->next) {
		if ((*n)->major == major)
			break;
	}
	if (!*n)
		*n = p;
	else
		ret = -EBUSY;
	spin_unlock(&major_names_spinlock);

	if (ret < 0) {
		printk("register_blkdev: cannot get major %u for %s\n",
		       major, name);
		kfree(p);
	}
out:
	mutex_unlock(&major_names_lock);
	return ret;
}
EXPORT_SYMBOL(__register_blkdev);

void unregister_blkdev(unsigned int major, const char *name)
{
	struct blk_major_name **n;
	struct blk_major_name *p = NULL;
	int index = major_to_index(major);

	mutex_lock(&major_names_lock);
	spin_lock(&major_names_spinlock);
	for (n = &major_names[index]; *n; n = &(*n)->next)
		if ((*n)->major == major)
			break;
	if (!*n || strcmp((*n)->name, name)) {
		WARN_ON(1);
	} else {
		p = *n;
		*n = p->next;
	}
	spin_unlock(&major_names_spinlock);
	mutex_unlock(&major_names_lock);
	kfree(p);
}

EXPORT_SYMBOL(unregister_blkdev);
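
/*
 * Example (sketch): registering and unregistering a major; everything
 * prefixed "my_" is hypothetical. register_blkdev() is __register_blkdev()
 * with a NULL probe callback.
 *
 *	static int my_major;
 *
 *	static void my_probe(dev_t devt)
 *	{
 *		// allocate and add the gendisk backing this dev_t
 *	}
 *
 *	static int __init my_init(void)
 *	{
 *		int ret = __register_blkdev(0, "mydrv", my_probe);
 *
 *		if (ret < 0)
 *			return ret;
 *		my_major = ret;	// dynamically allocated major
 *		return 0;
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		unregister_blkdev(my_major, "mydrv");
 *	}
 */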

int blk_alloc_ext_minor(void)
{
	int idx;

	idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL);
	if (idx == -ENOSPC)
		return -EBUSY;
	return idx;
}

void blk_free_ext_minor(unsigned int minor)
{
	ida_free(&ext_devt_ida, minor);
}

void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
	struct block_device *part;
	unsigned long idx;

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, idx, part) {
		if (bdev_is_partition(part) && !bdev_nr_sectors(part))
			continue;
		if (!kobject_get_unless_zero(&part->bd_device.kobj))
			continue;

		rcu_read_unlock();
		kobject_uevent(bdev_kobj(part), action);
		put_device(&part->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
{
	struct file *file;
	int ret = 0;

	if (!disk_has_partscan(disk))
		return -EINVAL;
	if (disk->open_partitions)
		return -EBUSY;

	/*
	 * If the device is already opened exclusively by the current thread,
	 * it's safe to scan partitions; otherwise, use bd_prepare_to_claim()
	 * to synchronize with other exclusive openers and other partition
	 * scanners.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions,
					  NULL);
		if (ret)
			return ret;
	}

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL,
				     NULL, NULL);
	if (IS_ERR(file))
		ret = PTR_ERR(file);
	else
		fput(file);

	/*
	 * If the open above failed early, GD_NEED_PART_SCAN would still be
	 * set; left behind, it would cause a later re-assembly of a
	 * partitioned raid device to create partitions for the underlying
	 * disk.
	 */
	clear_bit(GD_NEED_PART_SCAN, &disk->state);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(disk->part0, disk_scan_partitions);
	return ret;
}

static void add_disk_final(struct gendisk *disk)
{
	struct device *ddev = disk_to_dev(disk);

	if (!(disk->flags & GENHD_FL_HIDDEN)) {
		/* Make sure the first partition scan will proceed */
		if (get_capacity(disk) && disk_has_partscan(disk))
			set_bit(GD_NEED_PART_SCAN, &disk->state);

		bdev_add(disk->part0, ddev->devt);
		if (get_capacity(disk))
			disk_scan_partitions(disk, BLK_OPEN_READ);

		/*
		 * Announce the disk and partitions after all partitions are
		 * created. (for hidden disks uevents remain suppressed forever)
		 */
		dev_set_uevent_suppress(ddev, 0);
		disk_uevent(disk, KOBJ_ADD);
	}

	blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
	disk_add_events(disk);
	set_bit(GD_ADDED, &disk->state);
}

static int __add_disk(struct device *parent, struct gendisk *disk,
		      const struct attribute_group **groups,
		      struct fwnode_handle *fwnode)
{
	struct device *ddev = disk_to_dev(disk);
	int ret;

	if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS))
		return -EINVAL;

	if (queue_is_mq(disk->queue)) {
		/*
		 * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers.
		 */
		if (disk->fops->submit_bio || disk->fops->poll_bio)
			return -EINVAL;
	} else {
		if (!disk->fops->submit_bio)
			return -EINVAL;
		bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);
	}

	/*
	 * If the driver provides an explicit major number it also must provide
	 * the number of minor numbers supported, and those will be used to
	 * set up the gendisk.
	 * Otherwise just allocate the device numbers for both the whole device
	 * and all partitions from the extended dev_t space.
	 */
	ret = -EINVAL;
	if (disk->major) {
		if (WARN_ON(!disk->minors))
			goto out;

		if (disk->minors > DISK_MAX_PARTS) {
			pr_err("block: can't allocate more than %d partitions\n",
				DISK_MAX_PARTS);
			disk->minors = DISK_MAX_PARTS;
		}
		if (disk->first_minor > MINORMASK ||
		    disk->minors > MINORMASK + 1 ||
		    disk->first_minor + disk->minors > MINORMASK + 1)
			goto out;
	} else {
		if (WARN_ON(disk->minors))
			goto out;

		ret = blk_alloc_ext_minor();
		if (ret < 0)
			goto out;
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = ret;
	}

	/* delay uevents until we have scanned the partition table */
	dev_set_uevent_suppress(ddev, 1);

	ddev->parent = parent;
	ddev->groups = groups;
	dev_set_name(ddev, "%s", disk->disk_name);
	if (fwnode)
		device_set_node(ddev, fwnode);
	if (!(disk->flags & GENHD_FL_HIDDEN))
		ddev->devt = MKDEV(disk->major, disk->first_minor);
	ret = device_add(ddev);
	if (ret)
		goto out_free_ext_minor;

	ret = disk_alloc_events(disk);
	if (ret)
		goto out_device_del;

	ret = sysfs_create_link(block_depr, &ddev->kobj,
				kobject_name(&ddev->kobj));
	if (ret)
		goto out_device_del;

	/*
	 * Avoid a probable deadlock caused by allocating memory with
	 * GFP_KERNEL in the runtime_resume callback of any of the disk's
	 * ancestor devices.
	 */
	pm_runtime_set_memalloc_noio(ddev, true);

	disk->part0->bd_holder_dir =
		kobject_create_and_add("holders", &ddev->kobj);
	if (!disk->part0->bd_holder_dir) {
		ret = -ENOMEM;
		goto out_del_block_link;
	}
	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
	if (!disk->slave_dir) {
		ret = -ENOMEM;
		goto out_put_holder_dir;
	}

	ret = blk_register_queue(disk);
	if (ret)
		goto out_put_slave_dir;

	if (!(disk->flags & GENHD_FL_HIDDEN)) {
		ret = bdi_register(disk->bdi, "%u:%u",
				   disk->major, disk->first_minor);
		if (ret)
			goto out_unregister_queue;
		bdi_set_owner(disk->bdi, ddev);
		ret = sysfs_create_link(&ddev->kobj,
					&disk->bdi->dev->kobj, "bdi");
		if (ret)
			goto out_unregister_bdi;
	} else {
		/*
		 * Even if the block_device for a hidden gendisk is not
		 * registered, it needs to have a valid bd_dev so that the
		 * freeing of the dynamic major works.
		 */
		disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
	}
	return 0;

out_unregister_bdi:
	if (!(disk->flags & GENHD_FL_HIDDEN))
		bdi_unregister(disk->bdi);
out_unregister_queue:
	blk_unregister_queue(disk);
	rq_qos_exit(disk->queue);
out_put_slave_dir:
	kobject_put(disk->slave_dir);
	disk->slave_dir = NULL;
out_put_holder_dir:
	kobject_put(disk->part0->bd_holder_dir);
out_del_block_link:
	sysfs_remove_link(block_depr, dev_name(ddev));
	pm_runtime_set_memalloc_noio(ddev, false);
out_device_del:
	device_del(ddev);
out_free_ext_minor:
	if (disk->major == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(disk->first_minor);
out:
	return ret;
}

/**
 * add_disk_fwnode - add disk information to kernel list with fwnode
 * @parent: parent device for the disk
 * @disk: per-device partitioning information
 * @groups: Additional per-device sysfs groups
 * @fwnode: attached disk fwnode
 *
 * This function registers the partitioning information in @disk
 * with the kernel. Also attach a fwnode to the disk device.
 */
int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
				 const struct attribute_group **groups,
				 struct fwnode_handle *fwnode)
{
	struct blk_mq_tag_set *set;
	unsigned int memflags;
	int ret;

	if (queue_is_mq(disk->queue)) {
		set = disk->queue->tag_set;
		memflags = memalloc_noio_save();
		down_read(&set->update_nr_hwq_lock);
		ret = __add_disk(parent, disk, groups, fwnode);
		up_read(&set->update_nr_hwq_lock);
		memalloc_noio_restore(memflags);
	} else {
		ret = __add_disk(parent, disk, groups, fwnode);
	}

	/*
	 * add_disk_final() doesn't need to read `nr_hw_queues`, so call it
	 * outside of the read lock `set->update_nr_hwq_lock` to avoid an
	 * unnecessary lock dependency on `disk->open_mutex` from the
	 * partition scan.
	 */
	if (!ret)
		add_disk_final(disk);
	return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);

/**
 * device_add_disk - add disk information to kernel list
 * @parent: parent device for the disk
 * @disk: per-device partitioning information
 * @groups: Additional per-device sysfs groups
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
 */
int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
				 const struct attribute_group **groups)
{
	return add_disk_fwnode(parent, disk, groups, NULL);
}
EXPORT_SYMBOL(device_add_disk);
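
/*
 * Example (sketch): typical bring-up of a bio-based driver; everything
 * prefixed "my_" is hypothetical, and my_block_ops must provide
 * ->submit_bio as checked in __add_disk().
 *
 *	static int my_probe(struct device *parent)
 *	{
 *		struct queue_limits lim = {
 *			.logical_block_size	= 512,
 *		};
 *		struct gendisk *disk;
 *		int err;
 *
 *		disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
 *		if (IS_ERR(disk))
 *			return PTR_ERR(disk);
 *
 *		// major/minors left at 0: device numbers come from the
 *		// extended dev_t space (BLOCK_EXT_MAJOR)
 *		snprintf(disk->disk_name, sizeof(disk->disk_name), "myblk0");
 *		disk->fops = &my_block_ops;
 *		set_capacity(disk, my_nr_sectors);
 *
 *		err = device_add_disk(parent, disk, NULL);
 *		if (err)
 *			put_disk(disk);
 *		return err;
 *	}
 */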

static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
{
	struct block_device *bdev;
	unsigned long idx;

	/*
	 * On surprise disk removal, bdev_mark_dead() may call into file
	 * systems below. Make it clear that we're expecting to not hold
	 * disk->open_mutex.
	 */
	lockdep_assert_not_held(&disk->open_mutex);

	rcu_read_lock();
	xa_for_each(&disk->part_tbl, idx, bdev) {
		if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
			continue;
		rcu_read_unlock();

		bdev_mark_dead(bdev, surprise);

		put_device(&bdev->bd_device);
		rcu_read_lock();
	}
	rcu_read_unlock();
}

static bool __blk_mark_disk_dead(struct gendisk *disk)
{
	/*
	 * Fail any new I/O.
	 */
	if (test_and_set_bit(GD_DEAD, &disk->state))
		return false;

	if (test_bit(GD_OWNS_QUEUE, &disk->state))
		blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);

	/*
	 * Stop buffered writers from dirtying pages that can't be written out.
	 */
	set_capacity(disk, 0);

	/*
	 * Prevent new I/O from crossing bio_queue_enter().
	 */
	return blk_queue_start_drain(disk->queue);
}

/**
 * blk_mark_disk_dead - mark a disk as dead
 * @disk: disk to mark as dead
 *
 * Mark the disk as dead (e.g. surprise removed) and don't accept any new I/O
 * to this disk.
 */
void blk_mark_disk_dead(struct gendisk *disk)
{
	__blk_mark_disk_dead(disk);
	blk_report_disk_dead(disk, true);
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
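
/*
 * Example (sketch): a hotplug handler for surprise removal would mark the
 * disk dead before tearing it down; "my_dev" is hypothetical.
 *
 *	static void my_surprise_remove(struct my_dev *md)
 *	{
 *		blk_mark_disk_dead(md->disk);	// fail new I/O right away
 *		del_gendisk(md->disk);
 *		put_disk(md->disk);
 *	}
 */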

static void __del_gendisk(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct block_device *part;
	unsigned long idx;
	bool start_drain;

	might_sleep();

	if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
		return;

	disk_del_events(disk);

	/*
	 * Prevent new openers by unhashing the bdev inode.
	 */
	mutex_lock(&disk->open_mutex);
	xa_for_each(&disk->part_tbl, idx, part)
		bdev_unhash(part);
	mutex_unlock(&disk->open_mutex);

	/*
	 * Tell the file system to write back all dirty data and shut down if
	 * it hasn't been notified earlier.
	 */
	if (!test_bit(GD_DEAD, &disk->state))
		blk_report_disk_dead(disk, false);

	/*
	 * Drop all partitions now that the disk is marked dead.
	 */
	mutex_lock(&disk->open_mutex);
	start_drain = __blk_mark_disk_dead(disk);
	if (start_drain)
		blk_freeze_acquire_lock(q);
	xa_for_each_start(&disk->part_tbl, idx, part, 1)
		drop_partition(part);
	mutex_unlock(&disk->open_mutex);

	if (!(disk->flags & GENHD_FL_HIDDEN)) {
		sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");

		/*
		 * Unregister bdi before releasing device numbers (as they can
		 * get reused and we'd get clashes in sysfs).
		 */
		bdi_unregister(disk->bdi);
	}

	blk_unregister_queue(disk);

	kobject_put(disk->part0->bd_holder_dir);
	kobject_put(disk->slave_dir);
	disk->slave_dir = NULL;

	part_stat_set_all(disk->part0, 0);
	disk->part0->bd_stamp = 0;
	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
	device_del(disk_to_dev(disk));

	blk_mq_freeze_queue_wait(q);

	blk_throtl_cancel_bios(disk);

	blk_sync_queue(q);
	blk_flush_integrity();

	if (queue_is_mq(q))
		blk_mq_cancel_work_sync(q);

	rq_qos_exit(q);

	/*
	 * If the disk does not own the queue, allow using passthrough requests
	 * again.  Else leave the queue frozen to fail all I/O.
	 */
	if (!test_bit(GD_OWNS_QUEUE, &disk->state))
		__blk_mq_unfreeze_queue(q, true);
	else if (queue_is_mq(q))
		blk_mq_exit_queue(q);

	if (start_drain)
		blk_unfreeze_release_lock(q);
}

static void disable_elv_switch(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	WARN_ON_ONCE(!queue_is_mq(q));

	down_write(&set->update_nr_hwq_lock);
	blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
	up_write(&set->update_nr_hwq_lock);
}

/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counterpart to the respective device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * device_add_disk() was used.
 *
 * Drivers exist which depend on the release of the gendisk being synchronous;
 * it must not be deferred.
 *
 * Context: can sleep
 */
void del_gendisk(struct gendisk *disk)
{
	struct blk_mq_tag_set *set;
	unsigned int memflags;

	if (!queue_is_mq(disk->queue)) {
		__del_gendisk(disk);
	} else {
		set = disk->queue->tag_set;

		disable_elv_switch(disk->queue);

		memflags = memalloc_noio_save();
		down_read(&set->update_nr_hwq_lock);
		__del_gendisk(disk);
		up_read(&set->update_nr_hwq_lock);
		memalloc_noio_restore(memflags);
	}
}
EXPORT_SYMBOL(del_gendisk);

/**
 * invalidate_disk - invalidate the disk
 * @disk: the struct gendisk to invalidate
 *
 * A helper to invalidate the disk. It cleans the disk's associated
 * buffer/page caches and resets its internal state so that the disk
 * can be reused by the driver.
 *
 * Context: can sleep
 */
void invalidate_disk(struct gendisk *disk)
{
	struct block_device *bdev = disk->part0;

	invalidate_bdev(bdev);
	bdev->bd_mapping->wb_err = 0;
	set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);

/* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev,
					struct device_attribute *attr,
					char *page)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (!disk->bb)
		return sysfs_emit(page, "\n");

	return badblocks_show(disk->bb, page, 0);
}

static ssize_t disk_badblocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *page, size_t len)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (!disk->bb)
		return -ENXIO;

	return badblocks_store(disk->bb, page, len, 0);
}

#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
static bool blk_probe_dev(dev_t devt)
{
	unsigned int major = MAJOR(devt);
	struct blk_major_name **n;

	mutex_lock(&major_names_lock);
	for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
		if ((*n)->major == major && (*n)->probe) {
			(*n)->probe(devt);
			mutex_unlock(&major_names_lock);
			return true;
		}
	}
	mutex_unlock(&major_names_lock);
	return false;
}

void blk_request_module(dev_t devt)
{
	int error;

	if (blk_probe_dev(devt))
		return;

	error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));
	/* Make old-style 2.4 aliases work */
	if (error > 0)
		error = request_module("block-major-%d", MAJOR(devt));
	if (!error)
		blk_probe_dev(devt);
}
#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
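
/*
 * For illustration: with CONFIG_BLOCK_LEGACY_AUTOLOAD, opening a pre-created
 * node such as /dev/loop7 before the backing driver is loaded ends up in
 * blk_request_module(); a registered probe callback is tried first, then the
 * modprobe aliases. Drivers opt in to the alias scheme with, e.g.:
 *
 *	MODULE_ALIAS("block-major-7-*");
 */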

#ifdef CONFIG_PROC_FS
/* iterator */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
	loff_t skip = *pos;
	struct class_dev_iter *iter;
	struct device *dev;

	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
	if (!iter)
		return ERR_PTR(-ENOMEM);

	seqf->private = iter;
	class_dev_iter_init(iter, &block_class, NULL, &disk_type);
	do {
		dev = class_dev_iter_next(iter);
		if (!dev)
			return NULL;
	} while (skip--);

	return dev_to_disk(dev);
}

static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
	struct device *dev;

	(*pos)++;
	dev = class_dev_iter_next(seqf->private);
	if (dev)
		return dev_to_disk(dev);

	return NULL;
}

static void disk_seqf_stop(struct seq_file *seqf, void *v)
{
	struct class_dev_iter *iter = seqf->private;

	/* stop is called even after start failed :-( */
	if (iter) {
		class_dev_iter_exit(iter);
		kfree(iter);
		seqf->private = NULL;
	}
}

static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
	void *p;

	p = disk_seqf_start(seqf, pos);
	if (!IS_ERR_OR_NULL(p) && !*pos)
		seq_puts(seqf, "major minor  #blocks  name\n\n");
	return p;
}

static int show_partition(struct seq_file *seqf, void *v)
{
	struct gendisk *sgp = v;
	struct block_device *part;
	unsigned long idx;

	if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
		return 0;

	rcu_read_lock();
	xa_for_each(&sgp->part_tbl, idx, part) {
		if (!bdev_nr_sectors(part))
			continue;
		seq_printf(seqf, "%4d  %7d %10llu %pg\n",
			   MAJOR(part->bd_dev), MINOR(part->bd_dev),
			   bdev_nr_sectors(part) >> 1, part);
	}
	rcu_read_unlock();
	return 0;
}

static const struct seq_operations partitions_op = {
	.start	= show_partition_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
	.show	= show_partition
};
#endif

static int __init genhd_device_init(void)
{
	int error;

	error = class_register(&block_class);
	if (unlikely(error))
		return error;
	blk_dev_init();

	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	block_depr = kobject_create_and_add("block", NULL);
	return 0;
}

subsys_initcall(genhd_device_init);

static ssize_t disk_range_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n", disk->minors);
}

static ssize_t disk_ext_range_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n",
		(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}

static ssize_t disk_removable_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n",
		       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
}

static ssize_t disk_hidden_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n",
		       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}

static ssize_t disk_ro_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
}

ssize_t part_size_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
}

ssize_t part_stat_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);
	struct disk_stats stat;
	unsigned int inflight;

	inflight = bdev_count_inflight(bdev);
	if (inflight) {
		part_stat_lock();
		update_io_ticks(bdev, jiffies, true);
		part_stat_unlock();
	}
	part_stat_read_all(bdev, &stat);
	return sysfs_emit(buf,
		"%8lu %8lu %8llu %8u "
		"%8lu %8lu %8llu %8u "
		"%8u %8u %8u "
		"%8lu %8lu %8llu %8u "
		"%8lu %8u"
		"\n",
		stat.ios[STAT_READ],
		stat.merges[STAT_READ],
		(unsigned long long)stat.sectors[STAT_READ],
		(unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
		stat.ios[STAT_WRITE],
		stat.merges[STAT_WRITE],
		(unsigned long long)stat.sectors[STAT_WRITE],
		(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
		inflight,
		jiffies_to_msecs(stat.io_ticks),
		(unsigned int)div_u64(stat.nsecs[STAT_READ] +
				      stat.nsecs[STAT_WRITE] +
				      stat.nsecs[STAT_DISCARD] +
				      stat.nsecs[STAT_FLUSH],
						NSEC_PER_MSEC),
		stat.ios[STAT_DISCARD],
		stat.merges[STAT_DISCARD],
		(unsigned long long)stat.sectors[STAT_DISCARD],
		(unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
		stat.ios[STAT_FLUSH],
		(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
}
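
/*
 * For reference, the fields emitted above, in order: read ios, read merges,
 * read sectors, read ticks (ms); write ios, write merges, write sectors,
 * write ticks (ms); in_flight, io_ticks (ms), time_in_queue (ms); discard
 * ios, discard merges, discard sectors, discard ticks (ms); flush ios,
 * flush ticks (ms). See Documentation/block/stat.rst for details.
 */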

/*
 * Show the number of IOs issued to the driver.
 * For bio-based devices, counting starts at bdev_start_io_acct();
 * for rq-based devices, it starts at blk_mq_start_request().
 */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	struct block_device *bdev = dev_to_bdev(dev);
	struct request_queue *q = bdev_get_queue(bdev);
	unsigned int inflight[2] = {0};

	bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));

	return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
}

static ssize_t disk_capability_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	dev_warn_once(dev, "the capability attribute has been deprecated.\n");
	return sysfs_emit(buf, "0\n");
}

static ssize_t disk_alignment_offset_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
}

static ssize_t disk_discard_alignment_show(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%d\n", bdev_discard_alignment(disk->part0));
}

static ssize_t diskseq_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);

	return sysfs_emit(buf, "%llu\n", disk->diskseq);
}

static ssize_t partscan_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}

static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);

#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
		       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n",
		       bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL));
}

ssize_t part_fail_store(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int i;

	if (count > 0 && sscanf(buf, "%d", &i) > 0) {
		if (i)
			bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
		else
			bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
	}
	return count;
}

static struct device_attribute dev_attr_fail =
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

#ifdef CONFIG_FAIL_IO_TIMEOUT
static struct device_attribute dev_attr_fail_timeout =
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
#endif

static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
	&dev_attr_ext_range.attr,
	&dev_attr_removable.attr,
	&dev_attr_hidden.attr,
	&dev_attr_ro.attr,
	&dev_attr_size.attr,
	&dev_attr_alignment_offset.attr,
	&dev_attr_discard_alignment.attr,
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
	&dev_attr_inflight.attr,
	&dev_attr_badblocks.attr,
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
	&dev_attr_diskseq.attr,
	&dev_attr_partscan.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
#endif
	NULL
};

static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct gendisk *disk = dev_to_disk(dev);

	if (a == &dev_attr_badblocks.attr && !disk->bb)
		return 0;
	return a->mode;
}

static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
	.is_visible = disk_visible,
};

static const struct attribute_group *disk_attr_groups[] = {
	&disk_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
	&blk_trace_attr_group,
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
	&blk_integrity_attr_group,
#endif
	NULL
};

/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
 *
 * Context: can sleep
 */
static void disk_release(struct device *dev)
{
	struct gendisk *disk = dev_to_disk(dev);

	might_sleep();
	WARN_ON_ONCE(disk_live(disk));

	blk_trace_remove(disk->queue);

	/*
	 * To undo all of the initialization from blk_mq_init_allocated_queue
	 * in case of a probe failure where add_disk is never called we have to
	 * call blk_mq_exit_queue here. We can't do this for the more common
	 * teardown case (yet) as the tagset can be gone by the time the disk
	 * is released once it was added.
	 */
	if (queue_is_mq(disk->queue) &&
	    test_bit(GD_OWNS_QUEUE, &disk->state) &&
	    !test_bit(GD_ADDED, &disk->state))
		blk_mq_exit_queue(disk->queue);

	blkcg_exit_disk(disk);

	bioset_exit(&disk->bio_split);

	disk_release_events(disk);
	kfree(disk->random);
	disk_free_zone_resources(disk);
	xa_destroy(&disk->part_tbl);

	disk->queue->disk = NULL;
	blk_put_queue(disk->queue);

	if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
		disk->fops->free_disk(disk);

	bdev_drop(disk->part0);	/* frees the disk */
}

static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
	const struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

const struct class block_class = {
	.name		= "block",
	.dev_uevent	= block_uevent,
};

static char *block_devnode(const struct device *dev, umode_t *mode,
			   kuid_t *uid, kgid_t *gid)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
	return NULL;
}

const struct device_type disk_type = {
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
	.devnode	= block_devnode,
};

#ifdef CONFIG_PROC_FS
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
{
	struct gendisk *gp = v;
	struct block_device *hd;
	unsigned int inflight;
	struct disk_stats stat;
	unsigned long idx;

	/*
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
		seq_puts(seqf,	"major minor name"
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/

	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;

		inflight = bdev_count_inflight(hd);
		if (inflight) {
			part_stat_lock();
			update_io_ticks(hd, jiffies, true);
			part_stat_unlock();
		}
		part_stat_read_all(hd, &stat);
		seq_put_decimal_ull_width(seqf, "",  MAJOR(hd->bd_dev), 4);
		seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7);
		seq_printf(seqf, " %pg", hd);
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]);
		seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]);
		seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]);
		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ],
								     NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]);
		seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]);
		seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]);
		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
								     NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", inflight);
		seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks));
		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] +
								     stat.nsecs[STAT_WRITE] +
								     stat.nsecs[STAT_DISCARD] +
								     stat.nsecs[STAT_FLUSH],
								     NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]);
		seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]);
		seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]);
		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
								     NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
		seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
								     NSEC_PER_MSEC));
		seq_putc(seqf, '\n');
	}
	rcu_read_unlock();

	return 0;
}

static const struct seq_operations diskstats_op = {
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
	.show	= diskstats_show
};

static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);
	return 0;
}
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */

dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *part;
	dev_t devt = 0;

	rcu_read_lock();
	part = xa_load(&disk->part_tbl, partno);
	if (part)
		devt = part->bd_dev;
	rcu_read_unlock();

	return devt;
}

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
		struct lock_class_key *lkclass)
{
	struct gendisk *disk;

	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
	if (!disk)
		return NULL;

	if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
		goto out_free_disk;

	disk->bdi = bdi_alloc(node_id);
	if (!disk->bdi)
		goto out_free_bioset;

	/* bdev_alloc() might need the queue, set before the first call */
	disk->queue = q;

	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
		goto out_free_bdi;

	disk->node_id = node_id;
	mutex_init(&disk->open_mutex);
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;

	if (blkcg_init_disk(disk))
		goto out_erase_part0;

	disk_init_zone_resources(disk);
	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
	inc_diskseq(disk);
	q->disk = disk;
	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
	INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
	mutex_init(&disk->rqos_state_mutex);
	return disk;

out_erase_part0:
	xa_erase(&disk->part_tbl, 0);
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
	disk->part0->bd_disk = NULL;
	bdev_drop(disk->part0);
out_free_bdi:
	bdi_put(disk->bdi);
out_free_bioset:
	bioset_exit(&disk->bio_split);
out_free_disk:
	kfree(disk);
	return NULL;
}

struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
		struct lock_class_key *lkclass)
{
	struct queue_limits default_lim = { };
	struct request_queue *q;
	struct gendisk *disk;

	q = blk_alloc_queue(lim ? lim : &default_lim, node);
	if (IS_ERR(q))
		return ERR_CAST(q);

	disk = __alloc_disk_node(q, node, lkclass);
	if (!disk) {
		blk_put_queue(q);
		return ERR_PTR(-ENOMEM);
	}
	set_bit(GD_OWNS_QUEUE, &disk->state);
	return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);
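
/*
 * Example (sketch): drivers normally reach this through the blk_alloc_disk()
 * macro for bio-based devices, while blk-mq drivers use blk_mq_alloc_disk()
 * to allocate the queue from their tag_set; "my_" names are hypothetical.
 *
 *	disk = blk_mq_alloc_disk(&my_tag_set, &lim, my_driver_data);
 *	if (IS_ERR(disk))
 *		return PTR_ERR(disk);
 */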

/**
 * put_disk - decrements the gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for
 *
 * This decrements the refcount for the struct gendisk. When this reaches 0
 * we'll have disk_release() called.
 *
 * Note: for blk-mq disks, put_disk() must be called before freeing the
 * tag_set when handling probe errors (that is, before add_disk() is called).
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (disk)
		put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);
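
/*
 * Example (sketch): error unwinding in a blk-mq driver's probe path; note
 * that put_disk() runs before blk_mq_free_tag_set(), per the note above.
 * Labels and names are hypothetical.
 *
 *	out_put_disk:
 *		put_disk(disk);
 *	out_free_tag_set:
 *		blk_mq_free_tag_set(&my_tag_set);
 *		return err;
 */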

static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char event[] = "DISK_RO=1";
	char *envp[] = { event, NULL };

	if (!ro)
		event[8] = '0';
	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

/**
 * set_disk_ro - set a gendisk read-only
 * @disk:	gendisk to operate on
 * @read_only:	%true to set the disk read-only, %false to set it read/write
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
	if (read_only) {
		if (test_and_set_bit(GD_READ_ONLY, &disk->state))
			return;
	} else {
		if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
			return;
	}
	set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}