xref: /linux/drivers/nvme/host/multipath.c (revision 869567bcbe2dcc790860e05fc0e0c5e415bb22c2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2018 Christoph Hellwig.
4  */
5 
6 #include <linux/backing-dev.h>
7 #include <linux/moduleparam.h>
8 #include <linux/vmalloc.h>
9 #include <trace/events/block.h>
10 #include "nvme.h"
11 
/* Enables native multipath support; backs the "multipath" module parameter. */
bool multipath = true;
/* Backs the "multipath_always_on" module parameter; zero-initialized (off). */
static bool multipath_always_on;
14 
15 static int multipath_param_set(const char *val, const struct kernel_param *kp)
16 {
17 	int ret;
18 	bool *arg = kp->arg;
19 
20 	ret = param_set_bool(val, kp);
21 	if (ret)
22 		return ret;
23 
24 	if (multipath_always_on && !*arg) {
25 		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
26 		*arg = true;
27 		return -EINVAL;
28 	}
29 
30 	return 0;
31 }
32 
/* Ops for the "multipath" parameter: validating setter, stock bool getter. */
static const struct kernel_param_ops multipath_param_ops = {
	.set = multipath_param_set,
	.get = param_get_bool,
};

/* Registered with mode 0444 and backed by the "multipath" global. */
module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");
41 
42 static int multipath_always_on_set(const char *val,
43 		const struct kernel_param *kp)
44 {
45 	int ret;
46 	bool *arg = kp->arg;
47 
48 	ret = param_set_bool(val, kp);
49 	if (ret < 0)
50 		return ret;
51 
52 	if (*arg)
53 		multipath = true;
54 
55 	return 0;
56 }
57 
/* Ops for "multipath_always_on": custom setter, stock bool getter. */
static const struct kernel_param_ops multipath_always_on_ops = {
	.set = multipath_always_on_set,
	.get = param_get_bool,
};

/* Registered with mode 0444 and backed by the multipath_always_on global. */
module_param_cb(multipath_always_on, &multipath_always_on_ops,
		&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
67 
68 static const char *nvme_iopolicy_names[] = {
69 	[NVME_IOPOLICY_NUMA]	= "numa",
70 	[NVME_IOPOLICY_RR]	= "round-robin",
71 	[NVME_IOPOLICY_QD]      = "queue-depth",
72 };
73 
/* Module-wide default I/O policy; copied into each subsystem by nvme_mpath_default_iopolicy(). */
static int iopolicy = NVME_IOPOLICY_NUMA;
75 
76 static int nvme_iopolicy_parse(const char *str)
77 {
78 	int i;
79 
80 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
81 		if (sysfs_streq(str, nvme_iopolicy_names[i]))
82 			return i;
83 	}
84 	return -EINVAL;
85 }
86 
87 static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
88 {
89 	int policy;
90 
91 	if (!val)
92 		return -EINVAL;
93 
94 	policy = nvme_iopolicy_parse(val);
95 	if (policy < 0)
96 		return policy;
97 
98 	iopolicy = policy;
99 	return 0;
100 }
101 
102 static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
103 {
104 	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
105 }
106 
/*
 * "iopolicy" module parameter (mode 0644), served by nvme_set_iopolicy() and
 * nvme_get_iopolicy().
 */
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
111 
112 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
113 {
114 	subsys->iopolicy = iopolicy;
115 }
116 
117 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
118 {
119 	struct nvme_ns_head *h;
120 
121 	lockdep_assert_held(&subsys->lock);
122 	list_for_each_entry(h, &subsys->nsheads, entry)
123 		if (h->disk)
124 			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
125 }
126 
127 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
128 {
129 	struct nvme_ns_head *h;
130 
131 	lockdep_assert_held(&subsys->lock);
132 	list_for_each_entry(h, &subsys->nsheads, entry)
133 		if (h->disk)
134 			blk_mq_freeze_queue_wait(h->disk->queue);
135 }
136 
137 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
138 {
139 	struct nvme_ns_head *h;
140 
141 	lockdep_assert_held(&subsys->lock);
142 	list_for_each_entry(h, &subsys->nsheads, entry)
143 		if (h->disk)
144 			blk_freeze_queue_start(h->disk->queue);
145 }
146 
147 void nvme_failover_req(struct request *req)
148 {
149 	struct nvme_ns *ns = req->q->queuedata;
150 	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
151 	unsigned long flags;
152 	struct bio *bio;
153 
154 	nvme_mpath_clear_current_path(ns);
155 	atomic_long_inc(&ns->failover);
156 
157 	/*
158 	 * If we got back an ANA error, we know the controller is alive but not
159 	 * ready to serve this namespace.  Kick of a re-read of the ANA
160 	 * information page, and just try any other available path for now.
161 	 */
162 	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
163 		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
164 		queue_work(nvme_wq, &ns->ctrl->ana_work);
165 	}
166 
167 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
168 	for (bio = req->bio; bio; bio = bio->bi_next)
169 		bio_set_dev(bio, ns->head->disk->part0);
170 	blk_steal_bios(&ns->head->requeue_list, req);
171 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
172 
173 	nvme_req(req)->status = 0;
174 	nvme_end_req(req);
175 	kblockd_schedule_work(&ns->head->requeue_work);
176 }
177 
178 void nvme_mpath_start_request(struct request *rq)
179 {
180 	struct nvme_ns *ns = rq->q->queuedata;
181 	struct gendisk *disk = ns->head->disk;
182 
183 	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
184 	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
185 		atomic_inc(&ns->ctrl->nr_active);
186 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
187 	}
188 
189 	if (!blk_queue_io_stat(disk->queue) ||
190 	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
191 		return;
192 	if (blk_rq_is_passthrough(rq) &&
193 	    !blk_rq_passthrough_stats(rq, disk->queue))
194 		return;
195 
196 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
197 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
198 						      jiffies);
199 }
200 EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
201 
202 void nvme_mpath_end_request(struct request *rq)
203 {
204 	struct nvme_ns *ns = rq->q->queuedata;
205 
206 	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
207 		atomic_dec_if_positive(&ns->ctrl->nr_active);
208 
209 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
210 		return;
211 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
212 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
213 			 nvme_req(rq)->start_time);
214 }
215 
216 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
217 {
218 	struct nvme_ns *ns;
219 	int srcu_idx;
220 
221 	srcu_idx = srcu_read_lock(&ctrl->srcu);
222 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
223 				 srcu_read_lock_held(&ctrl->srcu)) {
224 		if (!ns->head->disk)
225 			continue;
226 		kblockd_schedule_work(&ns->head->requeue_work);
227 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
228 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
229 	}
230 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
231 }
232 
233 static const char *nvme_ana_state_names[] = {
234 	[0]				= "invalid state",
235 	[NVME_ANA_OPTIMIZED]		= "optimized",
236 	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
237 	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
238 	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
239 	[NVME_ANA_CHANGE]		= "change",
240 };
241 
242 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
243 {
244 	struct nvme_ns_head *head = ns->head;
245 	bool changed = false;
246 	int node;
247 
248 	for_each_node(node) {
249 		if (ns == rcu_access_pointer(head->current_path[node])) {
250 			rcu_assign_pointer(head->current_path[node], NULL);
251 			changed = true;
252 		}
253 	}
254 	return changed;
255 }
256 
257 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
258 {
259 	struct nvme_ns *ns;
260 	int srcu_idx;
261 
262 	srcu_idx = srcu_read_lock(&ctrl->srcu);
263 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
264 				 srcu_read_lock_held(&ctrl->srcu)) {
265 		nvme_mpath_clear_current_path(ns);
266 		kblockd_schedule_work(&ns->head->requeue_work);
267 	}
268 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
269 }
270 
271 void nvme_mpath_revalidate_paths(struct nvme_ns_head *head)
272 {
273 	sector_t capacity = get_capacity(head->disk);
274 	struct nvme_ns *ns;
275 	int node;
276 	int srcu_idx;
277 
278 	srcu_idx = srcu_read_lock(&head->srcu);
279 	list_for_each_entry_srcu(ns, &head->list, siblings,
280 				 srcu_read_lock_held(&head->srcu)) {
281 		if (capacity != get_capacity(ns->disk))
282 			clear_bit(NVME_NS_READY, &ns->flags);
283 	}
284 	srcu_read_unlock(&head->srcu, srcu_idx);
285 
286 	for_each_node(node)
287 		rcu_assign_pointer(head->current_path[node], NULL);
288 	kblockd_schedule_work(&head->requeue_work);
289 }
290 
291 static bool nvme_path_is_disabled(struct nvme_ns *ns)
292 {
293 	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
294 
295 	/*
296 	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
297 	 * still be able to complete assuming that the controller is connected.
298 	 * Otherwise it will fail immediately and return to the requeue list.
299 	 */
300 	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
301 		return true;
302 	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
303 	    !test_bit(NVME_NS_READY, &ns->flags))
304 		return true;
305 	return false;
306 }
307 
308 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
309 {
310 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
311 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
312 
313 	list_for_each_entry_srcu(ns, &head->list, siblings,
314 				 srcu_read_lock_held(&head->srcu)) {
315 		if (nvme_path_is_disabled(ns))
316 			continue;
317 
318 		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
319 		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
320 			distance = node_distance(node, ns->ctrl->numa_node);
321 		else
322 			distance = LOCAL_DISTANCE;
323 
324 		switch (ns->ana_state) {
325 		case NVME_ANA_OPTIMIZED:
326 			if (distance < found_distance) {
327 				found_distance = distance;
328 				found = ns;
329 			}
330 			break;
331 		case NVME_ANA_NONOPTIMIZED:
332 			if (distance < fallback_distance) {
333 				fallback_distance = distance;
334 				fallback = ns;
335 			}
336 			break;
337 		default:
338 			break;
339 		}
340 	}
341 
342 	if (!found)
343 		found = fallback;
344 	if (found)
345 		rcu_assign_pointer(head->current_path[node], found);
346 	return found;
347 }
348 
349 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
350 		struct nvme_ns *ns)
351 {
352 	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
353 			siblings);
354 	if (ns)
355 		return ns;
356 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
357 }
358 
359 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
360 {
361 	struct nvme_ns *ns, *found = NULL;
362 	int node = numa_node_id();
363 	struct nvme_ns *old = srcu_dereference(head->current_path[node],
364 					       &head->srcu);
365 
366 	if (unlikely(!old))
367 		return __nvme_find_path(head, node);
368 
369 	if (list_is_singular(&head->list)) {
370 		if (nvme_path_is_disabled(old))
371 			return NULL;
372 		return old;
373 	}
374 
375 	for (ns = nvme_next_ns(head, old);
376 	     ns && ns != old;
377 	     ns = nvme_next_ns(head, ns)) {
378 		if (nvme_path_is_disabled(ns))
379 			continue;
380 
381 		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
382 			found = ns;
383 			goto out;
384 		}
385 		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
386 			found = ns;
387 	}
388 
389 	/*
390 	 * The loop above skips the current path for round-robin semantics.
391 	 * Fall back to the current path if either:
392 	 *  - no other optimized path found and current is optimized,
393 	 *  - no other usable path found and current is usable.
394 	 */
395 	if (!nvme_path_is_disabled(old) &&
396 	    (old->ana_state == NVME_ANA_OPTIMIZED ||
397 	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
398 		return old;
399 
400 	if (!found)
401 		return NULL;
402 out:
403 	rcu_assign_pointer(head->current_path[node], found);
404 	return found;
405 }
406 
407 static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
408 {
409 	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
410 	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
411 	unsigned int depth;
412 
413 	list_for_each_entry_srcu(ns, &head->list, siblings,
414 				 srcu_read_lock_held(&head->srcu)) {
415 		if (nvme_path_is_disabled(ns))
416 			continue;
417 
418 		depth = atomic_read(&ns->ctrl->nr_active);
419 
420 		switch (ns->ana_state) {
421 		case NVME_ANA_OPTIMIZED:
422 			if (depth < min_depth_opt) {
423 				min_depth_opt = depth;
424 				best_opt = ns;
425 			}
426 			break;
427 		case NVME_ANA_NONOPTIMIZED:
428 			if (depth < min_depth_nonopt) {
429 				min_depth_nonopt = depth;
430 				best_nonopt = ns;
431 			}
432 			break;
433 		default:
434 			break;
435 		}
436 
437 		if (min_depth_opt == 0)
438 			return best_opt;
439 	}
440 
441 	return best_opt ? best_opt : best_nonopt;
442 }
443 
444 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
445 {
446 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
447 		ns->ana_state == NVME_ANA_OPTIMIZED;
448 }
449 
450 static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
451 {
452 	int node = numa_node_id();
453 	struct nvme_ns *ns;
454 
455 	ns = srcu_dereference(head->current_path[node], &head->srcu);
456 	if (unlikely(!ns))
457 		return __nvme_find_path(head, node);
458 	if (unlikely(!nvme_path_is_optimized(ns)))
459 		return __nvme_find_path(head, node);
460 	return ns;
461 }
462 
463 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
464 {
465 	switch (READ_ONCE(head->subsys->iopolicy)) {
466 	case NVME_IOPOLICY_QD:
467 		return nvme_queue_depth_path(head);
468 	case NVME_IOPOLICY_RR:
469 		return nvme_round_robin_path(head);
470 	default:
471 		return nvme_numa_path(head);
472 	}
473 }
474 
475 static bool nvme_available_path(struct nvme_ns_head *head)
476 {
477 	struct nvme_ns *ns;
478 
479 	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
480 		return false;
481 
482 	list_for_each_entry_srcu(ns, &head->list, siblings,
483 				 srcu_read_lock_held(&head->srcu)) {
484 		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
485 			continue;
486 		switch (nvme_ctrl_state(ns->ctrl)) {
487 		case NVME_CTRL_LIVE:
488 		case NVME_CTRL_RESETTING:
489 		case NVME_CTRL_CONNECTING:
490 			return true;
491 		default:
492 			break;
493 		}
494 	}
495 
496 	/*
497 	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
498 	 * not immediately fail I/O. Instead, requeue the I/O for the configured
499 	 * duration, anticipating that if there's a transient link failure then
500 	 * it may recover within this time window. This parameter is exported to
501 	 * userspace via sysfs, and its default value is zero. It is internally
502 	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
503 	 * non-zero, this flag is set to true. When zero, the flag is cleared.
504 	 */
505 	return nvme_mpath_queue_if_no_path(head);
506 }
507 
508 static void nvme_ns_head_submit_bio(struct bio *bio)
509 {
510 	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
511 	struct device *dev = disk_to_dev(head->disk);
512 	struct nvme_ns *ns;
513 	int srcu_idx;
514 
515 	/*
516 	 * The namespace might be going away and the bio might be moved to a
517 	 * different queue via blk_steal_bios(), so we need to use the bio_split
518 	 * pool from the original queue to allocate the bvecs from.
519 	 */
520 	bio = bio_split_to_limits(bio);
521 	if (!bio)
522 		return;
523 
524 	srcu_idx = srcu_read_lock(&head->srcu);
525 	ns = nvme_find_path(head);
526 	if (likely(ns)) {
527 		bio_set_dev(bio, ns->disk->part0);
528 		/*
529 		 * Use BIO_REMAPPED to skip bio_check_eod() when this bio
530 		 * enters submit_bio_noacct() for the per-path device. The EOD
531 		 * check already passed on the multipath head.
532 		 */
533 		bio_set_flag(bio, BIO_REMAPPED);
534 		bio->bi_opf |= REQ_NVME_MPATH;
535 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
536 				      bio->bi_iter.bi_sector);
537 		submit_bio_noacct(bio);
538 	} else if (nvme_available_path(head)) {
539 		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
540 
541 		spin_lock_irq(&head->requeue_lock);
542 		bio_list_add(&head->requeue_list, bio);
543 		spin_unlock_irq(&head->requeue_lock);
544 		atomic_long_inc(&head->io_requeue_no_usable_path_count);
545 	} else {
546 		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
547 
548 		bio_io_error(bio);
549 		atomic_long_inc(&head->io_fail_no_available_path_count);
550 	}
551 
552 	srcu_read_unlock(&head->srcu, srcu_idx);
553 }
554 
555 static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
556 {
557 	if (!nvme_tryget_ns_head(disk->private_data))
558 		return -ENXIO;
559 	return 0;
560 }
561 
562 static void nvme_ns_head_release(struct gendisk *disk)
563 {
564 	nvme_put_ns_head(disk->private_data);
565 }
566 
567 static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
568 		enum blk_unique_id type)
569 {
570 	struct nvme_ns_head *head = disk->private_data;
571 	struct nvme_ns *ns;
572 	int srcu_idx, ret = -EWOULDBLOCK;
573 
574 	srcu_idx = srcu_read_lock(&head->srcu);
575 	ns = nvme_find_path(head);
576 	if (ns)
577 		ret = nvme_ns_get_unique_id(ns, id, type);
578 	srcu_read_unlock(&head->srcu, srcu_idx);
579 	return ret;
580 }
581 
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * report_zones for the head disk: forward the request to a currently selected
 * path.  Returns the result of nvme_ns_report_zones(), or -EWOULDBLOCK when no
 * path is available.
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *path;
	int idx, ret;

	idx = srcu_read_lock(&head->srcu);
	path = nvme_find_path(head);
	ret = path ? nvme_ns_report_zones(path, sector, nr_zones, args) :
		     -EWOULDBLOCK;
	srcu_read_unlock(&head->srcu, idx);
	return ret;
}
#else
/* No zoned support built in: leave the operation unset. */
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
600 
/*
 * Block device operations of the multipath head disk (installed as
 * head->disk->fops in nvme_mpath_alloc_disk()).  .report_zones is NULL when
 * CONFIG_BLK_DEV_ZONED is not set.
 */
const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.get_unique_id	= nvme_ns_head_get_unique_id,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};
613 
614 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
615 {
616 	return container_of(cdev, struct nvme_ns_head, cdev);
617 }
618 
619 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
620 {
621 	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
622 		return -ENXIO;
623 	return 0;
624 }
625 
626 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
627 {
628 	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
629 	return 0;
630 }
631 
/*
 * File operations of the per-head character device; passed to nvme_cdev_add()
 * in nvme_add_ns_head_cdev().
 */
static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};
641 
642 static void nvme_add_ns_head_cdev(struct nvme_ns_head *head)
643 {
644 	char name[32];
645 
646 	head->cdev_device.parent = &head->subsys->dev;
647 	snprintf(name, sizeof(name), "ng%dn%d", head->subsys->instance,
648 		 head->instance);
649 
650 	if (nvme_cdev_add(name, &head->cdev, &head->cdev_device,
651 			&nvme_ns_head_chr_fops, THIS_MODULE)) {
652 		dev_err(disk_to_dev(head->disk),
653 			"Unable to create the %s device\n", name);
654 		return;
655 	}
656 	set_bit(NVME_NSHEAD_CDEV_LIVE, &head->flags);
657 }
658 
659 static void nvme_partition_scan_work(struct work_struct *work)
660 {
661 	struct nvme_ns_head *head =
662 		container_of(work, struct nvme_ns_head, partition_scan_work);
663 
664 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
665 					     &head->disk->state)))
666 		return;
667 
668 	mutex_lock(&head->disk->open_mutex);
669 	bdev_disk_changed(head->disk, false);
670 	mutex_unlock(&head->disk->open_mutex);
671 }
672 
673 static void nvme_requeue_work(struct work_struct *work)
674 {
675 	struct nvme_ns_head *head =
676 		container_of(work, struct nvme_ns_head, requeue_work);
677 	struct bio *bio, *next;
678 
679 	spin_lock_irq(&head->requeue_lock);
680 	next = bio_list_get(&head->requeue_list);
681 	spin_unlock_irq(&head->requeue_lock);
682 
683 	while ((bio = next) != NULL) {
684 		next = bio->bi_next;
685 		bio->bi_next = NULL;
686 
687 		submit_bio_noacct(bio);
688 	}
689 }
690 
691 static void nvme_remove_head(struct nvme_ns_head *head)
692 {
693 	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
694 		/*
695 		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
696 		 * to allow multipath to fail all I/O.
697 		 */
698 		kblockd_schedule_work(&head->requeue_work);
699 
700 		if (test_and_clear_bit(NVME_NSHEAD_CDEV_LIVE, &head->flags))
701 			nvme_cdev_del(&head->cdev, &head->cdev_device);
702 		synchronize_srcu(&head->srcu);
703 		del_gendisk(head->disk);
704 	}
705 	nvme_put_ns_head(head);
706 }
707 
708 static void nvme_remove_head_work(struct work_struct *work)
709 {
710 	struct nvme_ns_head *head = container_of(to_delayed_work(work),
711 			struct nvme_ns_head, remove_work);
712 	bool remove = false;
713 
714 	mutex_lock(&head->subsys->lock);
715 	if (list_empty(&head->list)) {
716 		list_del_init(&head->entry);
717 		remove = true;
718 	}
719 	mutex_unlock(&head->subsys->lock);
720 	if (remove)
721 		nvme_remove_head(head);
722 
723 	module_put(THIS_MODULE);
724 }
725 
726 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
727 {
728 	struct queue_limits lim;
729 
730 	mutex_init(&head->lock);
731 	bio_list_init(&head->requeue_list);
732 	spin_lock_init(&head->requeue_lock);
733 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
734 	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
735 	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
736 	head->delayed_removal_secs = 0;
737 
738 	/*
739 	 * If "multipath_always_on" is enabled, a multipath node is added
740 	 * regardless of whether the disk is single/multi ported, and whether
741 	 * the namespace is shared or private. If "multipath_always_on" is not
742 	 * enabled, a multipath node is added only if the subsystem supports
743 	 * multiple controllers and the "multipath" option is configured. In
744 	 * either case, for private namespaces, we ensure that the NSID is
745 	 * unique.
746 	 */
747 	if (!multipath_always_on) {
748 		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
749 				!multipath)
750 			return 0;
751 	}
752 
753 	if (!nvme_is_unique_nsid(ctrl, head))
754 		return 0;
755 
756 	blk_set_stacking_limits(&lim);
757 	lim.dma_alignment = 3;
758 	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
759 		BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES | BLK_FEAT_PCI_P2PDMA;
760 	if (head->ids.csi == NVME_CSI_ZNS)
761 		lim.features |= BLK_FEAT_ZONED;
762 
763 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
764 	if (IS_ERR(head->disk))
765 		return PTR_ERR(head->disk);
766 	head->disk->fops = &nvme_ns_head_ops;
767 	head->disk->private_data = head;
768 
769 	/*
770 	 * We need to suppress the partition scan from occuring within the
771 	 * controller's scan_work context. If a path error occurs here, the IO
772 	 * will wait until a path becomes available or all paths are torn down,
773 	 * but that action also occurs within scan_work, so it would deadlock.
774 	 * Defer the partition scan to a different context that does not block
775 	 * scan_work.
776 	 */
777 	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
778 	sprintf(head->disk->disk_name, "nvme%dn%d",
779 			ctrl->subsys->instance, head->instance);
780 	nvme_tryget_ns_head(head);
781 	return 0;
782 }
783 
784 static void nvme_mpath_set_live(struct nvme_ns *ns)
785 {
786 	struct nvme_ns_head *head = ns->head;
787 	int rc;
788 
789 	if (!head->disk)
790 		return;
791 
792 	/*
793 	 * test_and_set_bit() is used because it is protecting against two nvme
794 	 * paths simultaneously calling device_add_disk() on the same namespace
795 	 * head.
796 	 */
797 	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
798 		rc = device_add_disk(&head->subsys->dev, head->disk,
799 				     nvme_ns_attr_groups);
800 		if (rc) {
801 			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
802 			return;
803 		}
804 		nvme_add_ns_head_cdev(head);
805 		queue_work(nvme_wq, &head->partition_scan_work);
806 	}
807 
808 	nvme_mpath_add_sysfs_link(ns->head);
809 
810 	mutex_lock(&head->lock);
811 	if (nvme_path_is_optimized(ns)) {
812 		int node, srcu_idx;
813 
814 		srcu_idx = srcu_read_lock(&head->srcu);
815 		for_each_online_node(node)
816 			__nvme_find_path(head, node);
817 		srcu_read_unlock(&head->srcu, srcu_idx);
818 	}
819 	mutex_unlock(&head->lock);
820 
821 	synchronize_srcu(&head->srcu);
822 	kblockd_schedule_work(&head->requeue_work);
823 }
824 
/*
 * nvme_parse_ana_log - walk the group descriptors in ctrl->ana_log_buf
 * @ctrl: controller whose ANA log buffer is parsed
 * @data: opaque pointer handed through to @cb
 * @cb:   called once per validated group descriptor
 *
 * The caller must hold ctrl->ana_lock.  Each descriptor is bounds-checked
 * against ctrl->ana_log_size and sanity-checked (non-zero group id not above
 * anagrpmax, state within 1..NVME_ANA_CHANGE) before @cb sees it.
 *
 * Returns 0 on success, -EINVAL (with a one-time warning) on a malformed log,
 * or the first non-zero value returned by @cb.
 */
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	/* Descriptors start right after the response header. */
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		/* The fixed part of the descriptor must fit before reading it. */
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		/* The trailing NSID array must fit as well before @cb uses it. */
		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		/* Skip the NSID array to reach the next descriptor. */
		offset += nsid_buf_size;
	}

	return 0;
}
868 
869 static inline bool nvme_state_is_live(enum nvme_ana_state state)
870 {
871 	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
872 }
873 
874 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
875 		struct nvme_ns *ns)
876 {
877 	ns->ana_grpid = le32_to_cpu(desc->grpid);
878 	ns->ana_state = desc->state;
879 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
880 	/*
881 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
882 	 * and in turn to this path device.  However we cannot accept this I/O
883 	 * if the controller is not live.  This may deadlock if called from
884 	 * nvme_mpath_init_identify() and the ctrl will never complete
885 	 * initialization, preventing I/O from completing.  For this case we
886 	 * will reprocess the ANA log page in nvme_mpath_update() once the
887 	 * controller is ready.
888 	 */
889 	if (nvme_state_is_live(ns->ana_state) &&
890 	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
891 		nvme_mpath_set_live(ns);
892 	else {
893 		/*
894 		 * Add sysfs link from multipath head gendisk node to path
895 		 * device gendisk node.
896 		 * If path's ana state is live (i.e. state is either optimized
897 		 * or non-optimized) while we alloc the ns then sysfs link would
898 		 * be created from nvme_mpath_set_live(). In that case we would
899 		 * not fallthrough this code path. However for the path's ana
900 		 * state other than live, we call nvme_mpath_set_live() only
901 		 * after ana state transitioned to the live state. But we still
902 		 * want to create the sysfs link from head node to a path device
903 		 * irrespctive of the path's ana state.
904 		 * If we reach through here then it means that path's ana state
905 		 * is not live but still create the sysfs link to this path from
906 		 * head node if head node of the path has already come alive.
907 		 */
908 		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
909 			nvme_mpath_add_sysfs_link(ns->head);
910 	}
911 }
912 
/*
 * nvme_parse_ana_log() callback: apply one ANA group descriptor to the
 * namespaces of @ctrl.
 * @ctrl: controller the log was read from
 * @desc: group descriptor, already validated by the parser
 * @data: points to a counter of groups found in the "change" state
 *
 * Always returns 0.
 */
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;
	int srcu_idx;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	/* A group without namespaces has nothing to update. */
	if (!nr_nsids)
		return 0;

	/*
	 * Walk the controller's namespaces and the descriptor's NSID array in
	 * lockstep; n indexes the array.
	 * NOTE(review): this appears to rely on both being in ascending NSID
	 * order - confirm against the list insertion code and the spec.
	 */
	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		/* Namespace is behind the current NSID: try the next namespace. */
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		/* Consume this NSID; stop once the array is exhausted. */
		if (++n == nr_nsids)
			break;
		/* Namespace is ahead: compare it against the next NSID as well. */
		if (ns->head->ns_id > nsid)
			goto again;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return 0;
}
949 
950 static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
951 {
952 	u32 nr_change_groups = 0;
953 	int error;
954 
955 	mutex_lock(&ctrl->ana_lock);
956 	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
957 			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
958 	if (error) {
959 		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
960 		goto out_unlock;
961 	}
962 
963 	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
964 			nvme_update_ana_state);
965 	if (error)
966 		goto out_unlock;
967 
968 	/*
969 	 * In theory we should have an ANATT timer per group as they might enter
970 	 * the change state at different times.  But that is a lot of overhead
971 	 * just to protect against a target that keeps entering new changes
972 	 * states while never finishing previous ones.  But we'll still
973 	 * eventually time out once all groups are in change state, so this
974 	 * isn't a big deal.
975 	 *
976 	 * We also double the ANATT value to provide some slack for transports
977 	 * or AEN processing overhead.
978 	 */
979 	if (nr_change_groups)
980 		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
981 	else
982 		timer_delete_sync(&ctrl->anatt_timer);
983 out_unlock:
984 	mutex_unlock(&ctrl->ana_lock);
985 	return error;
986 }
987 
988 static void nvme_ana_work(struct work_struct *work)
989 {
990 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
991 
992 	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
993 		return;
994 
995 	nvme_read_ana_log(ctrl);
996 }
997 
998 void nvme_mpath_update(struct nvme_ctrl *ctrl)
999 {
1000 	u32 nr_change_groups = 0;
1001 
1002 	if (!ctrl->ana_log_buf)
1003 		return;
1004 
1005 	mutex_lock(&ctrl->ana_lock);
1006 	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
1007 	mutex_unlock(&ctrl->ana_lock);
1008 }
1009 
/*
 * ANATT timer callback: log the timeout and request a controller reset.
 * The timer is armed from nvme_read_ana_log() while groups are in the change
 * state.
 */
static void nvme_anatt_timeout(struct timer_list *t)
{
	/* Recover the controller that embeds this timer. */
	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}
1017 
1018 void nvme_mpath_stop(struct nvme_ctrl *ctrl)
1019 {
1020 	if (!nvme_ctrl_use_ana(ctrl))
1021 		return;
1022 	timer_delete_sync(&ctrl->anatt_timer);
1023 	cancel_work_sync(&ctrl->ana_work);
1024 }
1025 
/*
 * Define a device attribute named subsys_attr_<_name> with the given mode and
 * show/store handlers.
 */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)
1029 
1030 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
1031 		struct device_attribute *attr, char *buf)
1032 {
1033 	struct nvme_subsystem *subsys =
1034 		container_of(dev, struct nvme_subsystem, dev);
1035 
1036 	return sysfs_emit(buf, "%s\n",
1037 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
1038 }
1039 
1040 static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
1041 		int iopolicy)
1042 {
1043 	struct nvme_ctrl *ctrl;
1044 	int old_iopolicy = READ_ONCE(subsys->iopolicy);
1045 
1046 	if (old_iopolicy == iopolicy)
1047 		return;
1048 
1049 	WRITE_ONCE(subsys->iopolicy, iopolicy);
1050 
1051 	/* iopolicy changes clear the mpath by design */
1052 	mutex_lock(&nvme_subsystems_lock);
1053 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1054 		nvme_mpath_clear_ctrl_paths(ctrl);
1055 	mutex_unlock(&nvme_subsystems_lock);
1056 
1057 	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
1058 			subsys->subnqn,
1059 			nvme_iopolicy_names[old_iopolicy],
1060 			nvme_iopolicy_names[iopolicy]);
1061 }
1062 
1063 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
1064 		struct device_attribute *attr, const char *buf, size_t count)
1065 {
1066 	struct nvme_subsystem *subsys =
1067 		container_of(dev, struct nvme_subsystem, dev);
1068 	int policy;
1069 
1070 	policy = nvme_iopolicy_parse(buf);
1071 	if (policy < 0)
1072 		return policy;
1073 
1074 	nvme_subsys_iopolicy_update(subsys, policy);
1075 	return count;
1076 }
1077 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
1078 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
1079 
1080 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
1081 		char *buf)
1082 {
1083 	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
1084 }
1085 DEVICE_ATTR_RO(ana_grpid);
1086 
1087 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
1088 		char *buf)
1089 {
1090 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1091 
1092 	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
1093 }
1094 DEVICE_ATTR_RO(ana_state);
1095 
1096 static ssize_t queue_depth_show(struct device *dev,
1097 		struct device_attribute *attr, char *buf)
1098 {
1099 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1100 
1101 	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
1102 		return 0;
1103 
1104 	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
1105 }
1106 DEVICE_ATTR_RO(queue_depth);
1107 
1108 static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
1109 		char *buf)
1110 {
1111 	int node, srcu_idx;
1112 	nodemask_t numa_nodes;
1113 	struct nvme_ns *current_ns;
1114 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1115 	struct nvme_ns_head *head = ns->head;
1116 
1117 	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
1118 		return 0;
1119 
1120 	nodes_clear(numa_nodes);
1121 
1122 	srcu_idx = srcu_read_lock(&head->srcu);
1123 	for_each_node(node) {
1124 		current_ns = srcu_dereference(head->current_path[node],
1125 				&head->srcu);
1126 		if (ns == current_ns)
1127 			node_set(node, numa_nodes);
1128 	}
1129 	srcu_read_unlock(&head->srcu, srcu_idx);
1130 
1131 	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
1132 }
1133 DEVICE_ATTR_RO(numa_nodes);
1134 
1135 static ssize_t delayed_removal_secs_show(struct device *dev,
1136 		struct device_attribute *attr, char *buf)
1137 {
1138 	struct gendisk *disk = dev_to_disk(dev);
1139 	struct nvme_ns_head *head = disk->private_data;
1140 	int ret;
1141 
1142 	mutex_lock(&head->subsys->lock);
1143 	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
1144 	mutex_unlock(&head->subsys->lock);
1145 	return ret;
1146 }
1147 
1148 static ssize_t delayed_removal_secs_store(struct device *dev,
1149 		struct device_attribute *attr, const char *buf, size_t count)
1150 {
1151 	struct gendisk *disk = dev_to_disk(dev);
1152 	struct nvme_ns_head *head = disk->private_data;
1153 	unsigned int sec;
1154 	int ret;
1155 
1156 	ret = kstrtouint(buf, 0, &sec);
1157 	if (ret < 0)
1158 		return ret;
1159 
1160 	mutex_lock(&head->subsys->lock);
1161 	head->delayed_removal_secs = sec;
1162 	if (sec)
1163 		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1164 	else
1165 		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1166 	mutex_unlock(&head->subsys->lock);
1167 	/*
1168 	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
1169 	 * by its reader.
1170 	 */
1171 	synchronize_srcu(&head->srcu);
1172 
1173 	return count;
1174 }
1175 
1176 DEVICE_ATTR_RW(delayed_removal_secs);
1177 
1178 static ssize_t multipath_failover_count_show(struct device *dev,
1179 		struct device_attribute *attr, char *buf)
1180 {
1181 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1182 
1183 	return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->failover));
1184 }
1185 
1186 static ssize_t multipath_failover_count_store(struct device *dev,
1187 		struct device_attribute *attr, const char *buf, size_t count)
1188 {
1189 	unsigned long failover;
1190 	int ret;
1191 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1192 
1193 	ret = kstrtoul(buf, 0, &failover);
1194 	if (ret)
1195 		return -EINVAL;
1196 
1197 	atomic_long_set(&ns->failover, failover);
1198 
1199 	return count;
1200 }
1201 
1202 DEVICE_ATTR_RW(multipath_failover_count);
1203 
1204 static ssize_t io_requeue_no_usable_path_count_show(struct device *dev,
1205 		struct device_attribute *attr, char *buf)
1206 {
1207 	struct gendisk *disk = dev_to_disk(dev);
1208 	struct nvme_ns_head *head = disk->private_data;
1209 
1210 	return sysfs_emit(buf, "%lu\n",
1211 		    atomic_long_read(&head->io_requeue_no_usable_path_count));
1212 }
1213 
1214 static ssize_t io_requeue_no_usable_path_count_store(struct device *dev,
1215 		struct device_attribute *attr, const char *buf, size_t count)
1216 {
1217 	int err;
1218 	unsigned long requeue_cnt;
1219 	struct gendisk *disk = dev_to_disk(dev);
1220 	struct nvme_ns_head *head = disk->private_data;
1221 
1222 	err = kstrtoul(buf, 0, &requeue_cnt);
1223 	if (err)
1224 		return -EINVAL;
1225 
1226 	atomic_long_set(&head->io_requeue_no_usable_path_count, requeue_cnt);
1227 
1228 	return count;
1229 }
1230 
1231 DEVICE_ATTR_RW(io_requeue_no_usable_path_count);
1232 
1233 static ssize_t io_fail_no_available_path_count_show(struct device *dev,
1234 		struct device_attribute *attr, char *buf)
1235 {
1236 	struct gendisk *disk = dev_to_disk(dev);
1237 	struct nvme_ns_head *head = disk->private_data;
1238 
1239 	return sysfs_emit(buf, "%lu\n",
1240 		    atomic_long_read(&head->io_fail_no_available_path_count));
1241 }
1242 
1243 static ssize_t io_fail_no_available_path_count_store(struct device *dev,
1244 		struct device_attribute *attr, const char *buf, size_t count)
1245 {
1246 	int err;
1247 	unsigned long fail_cnt;
1248 	struct gendisk *disk = dev_to_disk(dev);
1249 	struct nvme_ns_head *head = disk->private_data;
1250 
1251 	err = kstrtoul(buf, 0, &fail_cnt);
1252 	if (err)
1253 		return -EINVAL;
1254 
1255 	atomic_long_set(&head->io_fail_no_available_path_count, fail_cnt);
1256 
1257 	return count;
1258 }
1259 
1260 DEVICE_ATTR_RW(io_fail_no_available_path_count);
1261 
1262 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
1263 		struct nvme_ana_group_desc *desc, void *data)
1264 {
1265 	struct nvme_ana_group_desc *dst = data;
1266 
1267 	if (desc->grpid != dst->grpid)
1268 		return 0;
1269 
1270 	*dst = *desc;
1271 	return -ENXIO; /* just break out of the loop */
1272 }
1273 
1274 void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
1275 {
1276 	struct device *target;
1277 	int rc, srcu_idx;
1278 	struct nvme_ns *ns;
1279 	struct kobject *kobj;
1280 
1281 	/*
1282 	 * Ensure head disk node is already added otherwise we may get invalid
1283 	 * kobj for head disk node
1284 	 */
1285 	if (!test_bit(GD_ADDED, &head->disk->state))
1286 		return;
1287 
1288 	kobj = &disk_to_dev(head->disk)->kobj;
1289 
1290 	/*
1291 	 * loop through each ns chained through the head->list and create the
1292 	 * sysfs link from head node to the ns path node
1293 	 */
1294 	srcu_idx = srcu_read_lock(&head->srcu);
1295 
1296 	list_for_each_entry_srcu(ns, &head->list, siblings,
1297 				 srcu_read_lock_held(&head->srcu)) {
1298 		/*
1299 		 * Ensure that ns path disk node is already added otherwise we
1300 		 * may get invalid kobj name for target
1301 		 */
1302 		if (!test_bit(GD_ADDED, &ns->disk->state))
1303 			continue;
1304 
1305 		/*
1306 		 * Avoid creating link if it already exists for the given path.
1307 		 * When path ana state transitions from optimized to non-
1308 		 * optimized or vice-versa, the nvme_mpath_set_live() is
1309 		 * invoked which in truns call this function. Now if the sysfs
1310 		 * link already exists for the given path and we attempt to re-
1311 		 * create the link then sysfs code would warn about it loudly.
1312 		 * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure
1313 		 * that we're not creating duplicate link.
1314 		 * The test_and_set_bit() is used because it is protecting
1315 		 * against multiple nvme paths being simultaneously added.
1316 		 */
1317 		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1318 			continue;
1319 
1320 		target = disk_to_dev(ns->disk);
1321 		/*
1322 		 * Create sysfs link from head gendisk kobject @kobj to the
1323 		 * ns path gendisk kobject @target->kobj.
1324 		 */
1325 		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
1326 				&target->kobj, dev_name(target));
1327 		if (unlikely(rc)) {
1328 			dev_err(disk_to_dev(ns->head->disk),
1329 					"failed to create link to %s\n",
1330 					dev_name(target));
1331 			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1332 		}
1333 	}
1334 
1335 	srcu_read_unlock(&head->srcu, srcu_idx);
1336 }
1337 
1338 void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
1339 {
1340 	struct device *target;
1341 	struct kobject *kobj;
1342 
1343 	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1344 		return;
1345 
1346 	target = disk_to_dev(ns->disk);
1347 	kobj = &disk_to_dev(ns->head->disk)->kobj;
1348 	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
1349 			dev_name(target));
1350 	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1351 }
1352 
1353 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
1354 {
1355 	if (nvme_ctrl_use_ana(ns->ctrl)) {
1356 		struct nvme_ana_group_desc desc = {
1357 			.grpid = anagrpid,
1358 			.state = 0,
1359 		};
1360 
1361 		mutex_lock(&ns->ctrl->ana_lock);
1362 		ns->ana_grpid = le32_to_cpu(anagrpid);
1363 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
1364 		mutex_unlock(&ns->ctrl->ana_lock);
1365 		if (desc.state) {
1366 			/* found the group desc: update */
1367 			nvme_update_ns_ana_state(&desc, ns);
1368 		} else {
1369 			/* group desc not found: trigger a re-read */
1370 			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
1371 			queue_work(nvme_wq, &ns->ctrl->ana_work);
1372 		}
1373 	} else {
1374 		ns->ana_state = NVME_ANA_OPTIMIZED;
1375 		nvme_mpath_set_live(ns);
1376 	}
1377 
1378 #ifdef CONFIG_BLK_DEV_ZONED
1379 	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
1380 		ns->head->disk->nr_zones = ns->disk->nr_zones;
1381 #endif
1382 }
1383 
1384 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
1385 {
1386 	bool remove = false;
1387 
1388 	if (!head->disk)
1389 		return;
1390 
1391 	mutex_lock(&head->subsys->lock);
1392 	/*
1393 	 * We are called when all paths have been removed, and at that point
1394 	 * head->list is expected to be empty. However, nvme_ns_remove() and
1395 	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
1396 	 * removal_secs is configured, it is possible that by the time we reach
1397 	 * this point, head->list may no longer be empty. Therefore, we recheck
1398 	 * head->list here. If it is no longer empty then we skip enqueuing the
1399 	 * delayed head removal work.
1400 	 */
1401 	if (!list_empty(&head->list))
1402 		goto out;
1403 
1404 	/*
1405 	 * Ensure that no one could remove this module while the head
1406 	 * remove work is pending.
1407 	 */
1408 	if (head->delayed_removal_secs && try_module_get(THIS_MODULE)) {
1409 		mod_delayed_work(nvme_wq, &head->remove_work,
1410 				head->delayed_removal_secs * HZ);
1411 	} else {
1412 		list_del_init(&head->entry);
1413 		remove = true;
1414 	}
1415 out:
1416 	mutex_unlock(&head->subsys->lock);
1417 	if (remove)
1418 		nvme_remove_head(head);
1419 }
1420 
1421 void nvme_mpath_put_disk(struct nvme_ns_head *head)
1422 {
1423 	if (!head->disk)
1424 		return;
1425 	/* make sure all pending bios are cleaned up */
1426 	kblockd_schedule_work(&head->requeue_work);
1427 	flush_work(&head->requeue_work);
1428 	flush_work(&head->partition_scan_work);
1429 	put_disk(head->disk);
1430 }
1431 
1432 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
1433 {
1434 	mutex_init(&ctrl->ana_lock);
1435 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
1436 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
1437 }
1438 
1439 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1440 {
1441 	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
1442 	size_t ana_log_size;
1443 	int error = 0;
1444 
1445 	/* check if multipath is enabled and we have the capability */
1446 	if (!multipath || !ctrl->subsys ||
1447 	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
1448 		return 0;
1449 
1450 	/* initialize this in the identify path to cover controller resets */
1451 	atomic_set(&ctrl->nr_active, 0);
1452 
1453 	if (!ctrl->max_namespaces ||
1454 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
1455 		dev_err(ctrl->device,
1456 			"Invalid MNAN value %u\n", ctrl->max_namespaces);
1457 		return -EINVAL;
1458 	}
1459 
1460 	ctrl->anacap = id->anacap;
1461 	ctrl->anatt = id->anatt;
1462 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
1463 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
1464 
1465 	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
1466 		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
1467 		ctrl->max_namespaces * sizeof(__le32);
1468 	if (ana_log_size > max_transfer_size) {
1469 		dev_err(ctrl->device,
1470 			"ANA log page size (%zd) larger than MDTS (%zd).\n",
1471 			ana_log_size, max_transfer_size);
1472 		dev_err(ctrl->device, "disabling ANA support.\n");
1473 		goto out_uninit;
1474 	}
1475 	if (ana_log_size > ctrl->ana_log_size) {
1476 		nvme_mpath_stop(ctrl);
1477 		nvme_mpath_uninit(ctrl);
1478 		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
1479 		if (!ctrl->ana_log_buf)
1480 			return -ENOMEM;
1481 	}
1482 	ctrl->ana_log_size = ana_log_size;
1483 	error = nvme_read_ana_log(ctrl);
1484 	if (error)
1485 		goto out_uninit;
1486 	return 0;
1487 
1488 out_uninit:
1489 	nvme_mpath_uninit(ctrl);
1490 	return error;
1491 }
1492 
1493 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
1494 {
1495 	kvfree(ctrl->ana_log_buf);
1496 	ctrl->ana_log_buf = NULL;
1497 	ctrl->ana_log_size = 0;
1498 }
1499