xref: /linux/drivers/nvme/host/multipath.c (revision ed60c09f292f1383bbcf79dcf61b6257bbb3a503)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2018 Christoph Hellwig.
4  */
5 
6 #include <linux/backing-dev.h>
7 #include <linux/moduleparam.h>
8 #include <linux/vmalloc.h>
9 #include <trace/events/block.h>
10 #include "nvme.h"
11 
/*
 * "multipath" module parameter: native multipath support, on by default.
 * Non-static, so it is visible outside this file.
 */
bool multipath = true;
/*
 * "multipath_always_on" module parameter; setting it also forces
 * "multipath" to true (see multipath_always_on_set()).
 */
static bool multipath_always_on;
14 
15 static int multipath_param_set(const char *val, const struct kernel_param *kp)
16 {
17 	int ret;
18 	bool *arg = kp->arg;
19 
20 	ret = param_set_bool(val, kp);
21 	if (ret)
22 		return ret;
23 
24 	if (multipath_always_on && !*arg) {
25 		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
26 		*arg = true;
27 		return -EINVAL;
28 	}
29 
30 	return 0;
31 }
32 
33 static const struct kernel_param_ops multipath_param_ops = {
34 	.set = multipath_param_set,
35 	.get = param_get_bool,
36 };
37 
38 module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
39 MODULE_PARM_DESC(multipath,
40 	"turn on native support for multiple controllers per subsystem");
41 
42 static int multipath_always_on_set(const char *val,
43 		const struct kernel_param *kp)
44 {
45 	int ret;
46 	bool *arg = kp->arg;
47 
48 	ret = param_set_bool(val, kp);
49 	if (ret < 0)
50 		return ret;
51 
52 	if (*arg)
53 		multipath = true;
54 
55 	return 0;
56 }
57 
58 static const struct kernel_param_ops multipath_always_on_ops = {
59 	.set = multipath_always_on_set,
60 	.get = param_get_bool,
61 };
62 
63 module_param_cb(multipath_always_on, &multipath_always_on_ops,
64 		&multipath_always_on, 0444);
65 MODULE_PARM_DESC(multipath_always_on,
66 	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
67 
68 static const char *nvme_iopolicy_names[] = {
69 	[NVME_IOPOLICY_NUMA]	= "numa",
70 	[NVME_IOPOLICY_RR]	= "round-robin",
71 	[NVME_IOPOLICY_QD]      = "queue-depth",
72 };
73 
74 static int iopolicy = NVME_IOPOLICY_NUMA;
75 
76 static int nvme_iopolicy_parse(const char *str)
77 {
78 	int i;
79 
80 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
81 		if (sysfs_streq(str, nvme_iopolicy_names[i]))
82 			return i;
83 	}
84 	return -EINVAL;
85 }
86 
87 static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
88 {
89 	int policy;
90 
91 	if (!val)
92 		return -EINVAL;
93 
94 	policy = nvme_iopolicy_parse(val);
95 	if (policy < 0)
96 		return policy;
97 
98 	iopolicy = policy;
99 	return 0;
100 }
101 
102 static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
103 {
104 	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
105 }
106 
107 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
108 	&iopolicy, 0644);
109 MODULE_PARM_DESC(iopolicy,
110 	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
111 
112 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
113 {
114 	subsys->iopolicy = iopolicy;
115 }
116 
117 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
118 {
119 	struct nvme_ns_head *h;
120 
121 	lockdep_assert_held(&subsys->lock);
122 	list_for_each_entry(h, &subsys->nsheads, entry)
123 		if (h->disk)
124 			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
125 }
126 
127 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
128 {
129 	struct nvme_ns_head *h;
130 
131 	lockdep_assert_held(&subsys->lock);
132 	list_for_each_entry(h, &subsys->nsheads, entry)
133 		if (h->disk)
134 			blk_mq_freeze_queue_wait(h->disk->queue);
135 }
136 
137 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
138 {
139 	struct nvme_ns_head *h;
140 
141 	lockdep_assert_held(&subsys->lock);
142 	list_for_each_entry(h, &subsys->nsheads, entry)
143 		if (h->disk)
144 			blk_freeze_queue_start(h->disk->queue);
145 }
146 
147 void nvme_failover_req(struct request *req)
148 {
149 	struct nvme_ns *ns = req->q->queuedata;
150 	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
151 	unsigned long flags;
152 	struct bio *bio;
153 
154 	nvme_mpath_clear_current_path(ns);
155 	atomic_long_inc(&ns->failover);
156 
157 	/*
158 	 * If we got back an ANA error, we know the controller is alive but not
159 	 * ready to serve this namespace.  Kick of a re-read of the ANA
160 	 * information page, and just try any other available path for now.
161 	 */
162 	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
163 		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
164 		queue_work(nvme_wq, &ns->ctrl->ana_work);
165 	}
166 
167 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
168 	for (bio = req->bio; bio; bio = bio->bi_next)
169 		bio_set_dev(bio, ns->head->disk->part0);
170 	blk_steal_bios(&ns->head->requeue_list, req);
171 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
172 
173 	nvme_req(req)->status = 0;
174 	nvme_end_req(req);
175 	kblockd_schedule_work(&ns->head->requeue_work);
176 }
177 
178 void nvme_mpath_start_request(struct request *rq)
179 {
180 	struct nvme_ns *ns = rq->q->queuedata;
181 	struct gendisk *disk = ns->head->disk;
182 
183 	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
184 	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
185 		atomic_inc(&ns->ctrl->nr_active);
186 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
187 	}
188 
189 	if (!blk_queue_io_stat(disk->queue) ||
190 	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
191 		return;
192 	if (blk_rq_is_passthrough(rq) &&
193 	    !blk_rq_passthrough_stats(rq, disk->queue))
194 		return;
195 
196 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
197 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
198 						      jiffies);
199 }
200 EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
201 
202 void nvme_mpath_end_request(struct request *rq)
203 {
204 	struct nvme_ns *ns = rq->q->queuedata;
205 
206 	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
207 		atomic_dec_if_positive(&ns->ctrl->nr_active);
208 
209 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
210 		return;
211 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
212 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
213 			 nvme_req(rq)->start_time);
214 }
215 
216 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
217 {
218 	struct nvme_ns *ns;
219 	int srcu_idx;
220 
221 	srcu_idx = srcu_read_lock(&ctrl->srcu);
222 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
223 				 srcu_read_lock_held(&ctrl->srcu)) {
224 		if (!ns->head->disk)
225 			continue;
226 		kblockd_schedule_work(&ns->head->requeue_work);
227 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
228 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
229 	}
230 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
231 }
232 
/*
 * Human-readable ANA state names, indexed by enum nvme_ana_state value.
 * Index 0 is not a valid state and maps to a placeholder string.
 */
static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};
241 
242 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
243 {
244 	struct nvme_ns_head *head = ns->head;
245 	bool changed = false;
246 	int node;
247 
248 	for_each_node(node) {
249 		if (ns == rcu_access_pointer(head->current_path[node])) {
250 			rcu_assign_pointer(head->current_path[node], NULL);
251 			changed = true;
252 		}
253 	}
254 	return changed;
255 }
256 
257 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
258 {
259 	struct nvme_ns *ns;
260 	int srcu_idx;
261 
262 	srcu_idx = srcu_read_lock(&ctrl->srcu);
263 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
264 				 srcu_read_lock_held(&ctrl->srcu)) {
265 		nvme_mpath_clear_current_path(ns);
266 		kblockd_schedule_work(&ns->head->requeue_work);
267 	}
268 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
269 }
270 
271 void nvme_mpath_revalidate_paths(struct nvme_ns_head *head)
272 {
273 	sector_t capacity = get_capacity(head->disk);
274 	struct nvme_ns *ns;
275 	int node;
276 	int srcu_idx;
277 
278 	srcu_idx = srcu_read_lock(&head->srcu);
279 	list_for_each_entry_srcu(ns, &head->list, siblings,
280 				 srcu_read_lock_held(&head->srcu)) {
281 		if (capacity != get_capacity(ns->disk))
282 			clear_bit(NVME_NS_READY, &ns->flags);
283 	}
284 	srcu_read_unlock(&head->srcu, srcu_idx);
285 
286 	for_each_node(node)
287 		rcu_assign_pointer(head->current_path[node], NULL);
288 	kblockd_schedule_work(&head->requeue_work);
289 }
290 
291 static bool nvme_path_is_disabled(struct nvme_ns *ns)
292 {
293 	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
294 
295 	/*
296 	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
297 	 * still be able to complete assuming that the controller is connected.
298 	 * Otherwise it will fail immediately and return to the requeue list.
299 	 */
300 	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
301 		return true;
302 	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
303 	    !test_bit(NVME_NS_READY, &ns->flags))
304 		return true;
305 	return false;
306 }
307 
308 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
309 {
310 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
311 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
312 
313 	list_for_each_entry_srcu(ns, &head->list, siblings,
314 				 srcu_read_lock_held(&head->srcu)) {
315 		if (nvme_path_is_disabled(ns))
316 			continue;
317 
318 		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
319 		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
320 			distance = node_distance(node, ns->ctrl->numa_node);
321 		else
322 			distance = LOCAL_DISTANCE;
323 
324 		switch (ns->ana_state) {
325 		case NVME_ANA_OPTIMIZED:
326 			if (distance < found_distance) {
327 				found_distance = distance;
328 				found = ns;
329 			}
330 			break;
331 		case NVME_ANA_NONOPTIMIZED:
332 			if (distance < fallback_distance) {
333 				fallback_distance = distance;
334 				fallback = ns;
335 			}
336 			break;
337 		default:
338 			break;
339 		}
340 	}
341 
342 	if (!found)
343 		found = fallback;
344 	if (found)
345 		rcu_assign_pointer(head->current_path[node], found);
346 	return found;
347 }
348 
349 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
350 		struct nvme_ns *ns)
351 {
352 	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
353 			siblings);
354 	if (ns)
355 		return ns;
356 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
357 }
358 
359 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
360 {
361 	struct nvme_ns *ns, *found = NULL;
362 	int node = numa_node_id();
363 	struct nvme_ns *old = srcu_dereference(head->current_path[node],
364 					       &head->srcu);
365 
366 	if (unlikely(!old))
367 		return __nvme_find_path(head, node);
368 
369 	if (list_is_singular(&head->list)) {
370 		if (nvme_path_is_disabled(old))
371 			return NULL;
372 		return old;
373 	}
374 
375 	for (ns = nvme_next_ns(head, old);
376 	     ns && ns != old;
377 	     ns = nvme_next_ns(head, ns)) {
378 		if (nvme_path_is_disabled(ns))
379 			continue;
380 
381 		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
382 			found = ns;
383 			goto out;
384 		}
385 		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
386 			found = ns;
387 	}
388 
389 	/*
390 	 * The loop above skips the current path for round-robin semantics.
391 	 * Fall back to the current path if either:
392 	 *  - no other optimized path found and current is optimized,
393 	 *  - no other usable path found and current is usable.
394 	 */
395 	if (!nvme_path_is_disabled(old) &&
396 	    (old->ana_state == NVME_ANA_OPTIMIZED ||
397 	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
398 		return old;
399 
400 	if (!found)
401 		return NULL;
402 out:
403 	rcu_assign_pointer(head->current_path[node], found);
404 	return found;
405 }
406 
407 static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
408 {
409 	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
410 	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
411 	unsigned int depth;
412 
413 	list_for_each_entry_srcu(ns, &head->list, siblings,
414 				 srcu_read_lock_held(&head->srcu)) {
415 		if (nvme_path_is_disabled(ns))
416 			continue;
417 
418 		depth = atomic_read(&ns->ctrl->nr_active);
419 
420 		switch (ns->ana_state) {
421 		case NVME_ANA_OPTIMIZED:
422 			if (depth < min_depth_opt) {
423 				min_depth_opt = depth;
424 				best_opt = ns;
425 			}
426 			break;
427 		case NVME_ANA_NONOPTIMIZED:
428 			if (depth < min_depth_nonopt) {
429 				min_depth_nonopt = depth;
430 				best_nonopt = ns;
431 			}
432 			break;
433 		default:
434 			break;
435 		}
436 
437 		if (min_depth_opt == 0)
438 			return best_opt;
439 	}
440 
441 	return best_opt ? best_opt : best_nonopt;
442 }
443 
444 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
445 {
446 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
447 		ns->ana_state == NVME_ANA_OPTIMIZED;
448 }
449 
450 static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
451 {
452 	int node = numa_node_id();
453 	struct nvme_ns *ns;
454 
455 	ns = srcu_dereference(head->current_path[node], &head->srcu);
456 	if (unlikely(!ns))
457 		return __nvme_find_path(head, node);
458 	if (unlikely(!nvme_path_is_optimized(ns)))
459 		return __nvme_find_path(head, node);
460 	return ns;
461 }
462 
463 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
464 {
465 	switch (READ_ONCE(head->subsys->iopolicy)) {
466 	case NVME_IOPOLICY_QD:
467 		return nvme_queue_depth_path(head);
468 	case NVME_IOPOLICY_RR:
469 		return nvme_round_robin_path(head);
470 	default:
471 		return nvme_numa_path(head);
472 	}
473 }
474 
475 static bool nvme_available_path(struct nvme_ns_head *head)
476 {
477 	struct nvme_ns *ns;
478 
479 	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
480 		return false;
481 
482 	list_for_each_entry_srcu(ns, &head->list, siblings,
483 				 srcu_read_lock_held(&head->srcu)) {
484 		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
485 			continue;
486 		switch (nvme_ctrl_state(ns->ctrl)) {
487 		case NVME_CTRL_LIVE:
488 		case NVME_CTRL_RESETTING:
489 		case NVME_CTRL_CONNECTING:
490 			return true;
491 		default:
492 			break;
493 		}
494 	}
495 
496 	/*
497 	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
498 	 * not immediately fail I/O. Instead, requeue the I/O for the configured
499 	 * duration, anticipating that if there's a transient link failure then
500 	 * it may recover within this time window. This parameter is exported to
501 	 * userspace via sysfs, and its default value is zero. It is internally
502 	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
503 	 * non-zero, this flag is set to true. When zero, the flag is cleared.
504 	 */
505 	return nvme_mpath_queue_if_no_path(head);
506 }
507 
508 static void nvme_ns_head_submit_bio(struct bio *bio)
509 {
510 	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
511 	struct device *dev = disk_to_dev(head->disk);
512 	struct nvme_ns *ns;
513 	int srcu_idx;
514 
515 	/*
516 	 * The namespace might be going away and the bio might be moved to a
517 	 * different queue via blk_steal_bios(), so we need to use the bio_split
518 	 * pool from the original queue to allocate the bvecs from.
519 	 */
520 	bio = bio_split_to_limits(bio);
521 	if (!bio)
522 		return;
523 
524 	srcu_idx = srcu_read_lock(&head->srcu);
525 	ns = nvme_find_path(head);
526 	if (likely(ns)) {
527 		bio_set_dev(bio, ns->disk->part0);
528 		/*
529 		 * Use BIO_REMAPPED to skip bio_check_eod() when this bio
530 		 * enters submit_bio_noacct() for the per-path device. The EOD
531 		 * check already passed on the multipath head.
532 		 */
533 		bio_set_flag(bio, BIO_REMAPPED);
534 		bio->bi_opf |= REQ_NVME_MPATH;
535 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
536 				      bio->bi_iter.bi_sector);
537 		submit_bio_noacct(bio);
538 	} else if (nvme_available_path(head)) {
539 		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
540 
541 		spin_lock_irq(&head->requeue_lock);
542 		bio_list_add(&head->requeue_list, bio);
543 		spin_unlock_irq(&head->requeue_lock);
544 		atomic_long_inc(&head->io_requeue_no_usable_path_count);
545 	} else {
546 		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
547 
548 		bio_io_error(bio);
549 		atomic_long_inc(&head->io_fail_no_available_path_count);
550 	}
551 
552 	srcu_read_unlock(&head->srcu, srcu_idx);
553 }
554 
555 static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
556 {
557 	if (!nvme_tryget_ns_head(disk->private_data))
558 		return -ENXIO;
559 	return 0;
560 }
561 
562 static void nvme_ns_head_release(struct gendisk *disk)
563 {
564 	nvme_put_ns_head(disk->private_data);
565 }
566 
567 static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
568 		enum blk_unique_id type)
569 {
570 	struct nvme_ns_head *head = disk->private_data;
571 	struct nvme_ns *ns;
572 	int srcu_idx, ret = -EWOULDBLOCK;
573 
574 	srcu_idx = srcu_read_lock(&head->srcu);
575 	ns = nvme_find_path(head);
576 	if (ns)
577 		ret = nvme_ns_get_unique_id(ns, id, type);
578 	srcu_read_unlock(&head->srcu, srcu_idx);
579 	return ret;
580 }
581 
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * ->report_zones for the head disk: forward the request to whichever path
 * is currently selected.  Returns -EWOULDBLOCK when no path is available.
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *path;
	int result = -EWOULDBLOCK;
	int idx;

	idx = srcu_read_lock(&head->srcu);
	path = nvme_find_path(head);
	if (path)
		result = nvme_ns_report_zones(path, sector, nr_zones, args);
	srcu_read_unlock(&head->srcu, idx);

	return result;
}
#else
/* No zoned block device support: leave the ->report_zones hook unset. */
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
600 
601 const struct block_device_operations nvme_ns_head_ops = {
602 	.owner		= THIS_MODULE,
603 	.submit_bio	= nvme_ns_head_submit_bio,
604 	.open		= nvme_ns_head_open,
605 	.release	= nvme_ns_head_release,
606 	.ioctl		= nvme_ns_head_ioctl,
607 	.compat_ioctl	= blkdev_compat_ptr_ioctl,
608 	.getgeo		= nvme_getgeo,
609 	.get_unique_id	= nvme_ns_head_get_unique_id,
610 	.report_zones	= nvme_ns_head_report_zones,
611 	.pr_ops		= &nvme_pr_ops,
612 };
613 
614 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
615 {
616 	return container_of(cdev, struct nvme_ns_head, cdev);
617 }
618 
619 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
620 {
621 	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
622 		return -ENXIO;
623 	return 0;
624 }
625 
626 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
627 {
628 	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
629 	return 0;
630 }
631 
632 static const struct file_operations nvme_ns_head_chr_fops = {
633 	.owner		= THIS_MODULE,
634 	.open		= nvme_ns_head_chr_open,
635 	.release	= nvme_ns_head_chr_release,
636 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
637 	.compat_ioctl	= compat_ptr_ioctl,
638 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
639 	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
640 };
641 
642 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
643 {
644 	int ret;
645 
646 	head->cdev_device.parent = &head->subsys->dev;
647 	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
648 			   head->subsys->instance, head->instance);
649 	if (ret)
650 		return ret;
651 	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
652 			    &nvme_ns_head_chr_fops, THIS_MODULE);
653 	return ret;
654 }
655 
656 static void nvme_partition_scan_work(struct work_struct *work)
657 {
658 	struct nvme_ns_head *head =
659 		container_of(work, struct nvme_ns_head, partition_scan_work);
660 
661 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
662 					     &head->disk->state)))
663 		return;
664 
665 	mutex_lock(&head->disk->open_mutex);
666 	bdev_disk_changed(head->disk, false);
667 	mutex_unlock(&head->disk->open_mutex);
668 }
669 
670 static void nvme_requeue_work(struct work_struct *work)
671 {
672 	struct nvme_ns_head *head =
673 		container_of(work, struct nvme_ns_head, requeue_work);
674 	struct bio *bio, *next;
675 
676 	spin_lock_irq(&head->requeue_lock);
677 	next = bio_list_get(&head->requeue_list);
678 	spin_unlock_irq(&head->requeue_lock);
679 
680 	while ((bio = next) != NULL) {
681 		next = bio->bi_next;
682 		bio->bi_next = NULL;
683 
684 		submit_bio_noacct(bio);
685 	}
686 }
687 
688 static void nvme_remove_head(struct nvme_ns_head *head)
689 {
690 	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
691 		/*
692 		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
693 		 * to allow multipath to fail all I/O.
694 		 */
695 		kblockd_schedule_work(&head->requeue_work);
696 
697 		nvme_cdev_del(&head->cdev, &head->cdev_device);
698 		synchronize_srcu(&head->srcu);
699 		del_gendisk(head->disk);
700 	}
701 	nvme_put_ns_head(head);
702 }
703 
704 static void nvme_remove_head_work(struct work_struct *work)
705 {
706 	struct nvme_ns_head *head = container_of(to_delayed_work(work),
707 			struct nvme_ns_head, remove_work);
708 	bool remove = false;
709 
710 	mutex_lock(&head->subsys->lock);
711 	if (list_empty(&head->list)) {
712 		list_del_init(&head->entry);
713 		remove = true;
714 	}
715 	mutex_unlock(&head->subsys->lock);
716 	if (remove)
717 		nvme_remove_head(head);
718 
719 	module_put(THIS_MODULE);
720 }
721 
722 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
723 {
724 	struct queue_limits lim;
725 
726 	mutex_init(&head->lock);
727 	bio_list_init(&head->requeue_list);
728 	spin_lock_init(&head->requeue_lock);
729 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
730 	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
731 	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
732 	head->delayed_removal_secs = 0;
733 
734 	/*
735 	 * If "multipath_always_on" is enabled, a multipath node is added
736 	 * regardless of whether the disk is single/multi ported, and whether
737 	 * the namespace is shared or private. If "multipath_always_on" is not
738 	 * enabled, a multipath node is added only if the subsystem supports
739 	 * multiple controllers and the "multipath" option is configured. In
740 	 * either case, for private namespaces, we ensure that the NSID is
741 	 * unique.
742 	 */
743 	if (!multipath_always_on) {
744 		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
745 				!multipath)
746 			return 0;
747 	}
748 
749 	if (!nvme_is_unique_nsid(ctrl, head))
750 		return 0;
751 
752 	blk_set_stacking_limits(&lim);
753 	lim.dma_alignment = 3;
754 	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
755 		BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES | BLK_FEAT_PCI_P2PDMA;
756 	if (head->ids.csi == NVME_CSI_ZNS)
757 		lim.features |= BLK_FEAT_ZONED;
758 
759 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
760 	if (IS_ERR(head->disk))
761 		return PTR_ERR(head->disk);
762 	head->disk->fops = &nvme_ns_head_ops;
763 	head->disk->private_data = head;
764 
765 	/*
766 	 * We need to suppress the partition scan from occuring within the
767 	 * controller's scan_work context. If a path error occurs here, the IO
768 	 * will wait until a path becomes available or all paths are torn down,
769 	 * but that action also occurs within scan_work, so it would deadlock.
770 	 * Defer the partition scan to a different context that does not block
771 	 * scan_work.
772 	 */
773 	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
774 	sprintf(head->disk->disk_name, "nvme%dn%d",
775 			ctrl->subsys->instance, head->instance);
776 	nvme_tryget_ns_head(head);
777 	return 0;
778 }
779 
780 static void nvme_mpath_set_live(struct nvme_ns *ns)
781 {
782 	struct nvme_ns_head *head = ns->head;
783 	int rc;
784 
785 	if (!head->disk)
786 		return;
787 
788 	/*
789 	 * test_and_set_bit() is used because it is protecting against two nvme
790 	 * paths simultaneously calling device_add_disk() on the same namespace
791 	 * head.
792 	 */
793 	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
794 		rc = device_add_disk(&head->subsys->dev, head->disk,
795 				     nvme_ns_attr_groups);
796 		if (rc) {
797 			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
798 			return;
799 		}
800 		nvme_add_ns_head_cdev(head);
801 		queue_work(nvme_wq, &head->partition_scan_work);
802 	}
803 
804 	nvme_mpath_add_sysfs_link(ns->head);
805 
806 	mutex_lock(&head->lock);
807 	if (nvme_path_is_optimized(ns)) {
808 		int node, srcu_idx;
809 
810 		srcu_idx = srcu_read_lock(&head->srcu);
811 		for_each_online_node(node)
812 			__nvme_find_path(head, node);
813 		srcu_read_unlock(&head->srcu, srcu_idx);
814 	}
815 	mutex_unlock(&head->lock);
816 
817 	synchronize_srcu(&head->srcu);
818 	kblockd_schedule_work(&head->requeue_work);
819 }
820 
821 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
822 		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
823 			void *))
824 {
825 	void *base = ctrl->ana_log_buf;
826 	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
827 	int error, i;
828 
829 	lockdep_assert_held(&ctrl->ana_lock);
830 
831 	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
832 		struct nvme_ana_group_desc *desc = base + offset;
833 		u32 nr_nsids;
834 		size_t nsid_buf_size;
835 
836 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
837 			return -EINVAL;
838 
839 		nr_nsids = le32_to_cpu(desc->nnsids);
840 		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
841 
842 		if (WARN_ON_ONCE(desc->grpid == 0))
843 			return -EINVAL;
844 		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
845 			return -EINVAL;
846 		if (WARN_ON_ONCE(desc->state == 0))
847 			return -EINVAL;
848 		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
849 			return -EINVAL;
850 
851 		offset += sizeof(*desc);
852 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
853 			return -EINVAL;
854 
855 		error = cb(ctrl, desc, data);
856 		if (error)
857 			return error;
858 
859 		offset += nsid_buf_size;
860 	}
861 
862 	return 0;
863 }
864 
865 static inline bool nvme_state_is_live(enum nvme_ana_state state)
866 {
867 	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
868 }
869 
870 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
871 		struct nvme_ns *ns)
872 {
873 	ns->ana_grpid = le32_to_cpu(desc->grpid);
874 	ns->ana_state = desc->state;
875 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
876 	/*
877 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
878 	 * and in turn to this path device.  However we cannot accept this I/O
879 	 * if the controller is not live.  This may deadlock if called from
880 	 * nvme_mpath_init_identify() and the ctrl will never complete
881 	 * initialization, preventing I/O from completing.  For this case we
882 	 * will reprocess the ANA log page in nvme_mpath_update() once the
883 	 * controller is ready.
884 	 */
885 	if (nvme_state_is_live(ns->ana_state) &&
886 	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
887 		nvme_mpath_set_live(ns);
888 	else {
889 		/*
890 		 * Add sysfs link from multipath head gendisk node to path
891 		 * device gendisk node.
892 		 * If path's ana state is live (i.e. state is either optimized
893 		 * or non-optimized) while we alloc the ns then sysfs link would
894 		 * be created from nvme_mpath_set_live(). In that case we would
895 		 * not fallthrough this code path. However for the path's ana
896 		 * state other than live, we call nvme_mpath_set_live() only
897 		 * after ana state transitioned to the live state. But we still
898 		 * want to create the sysfs link from head node to a path device
899 		 * irrespctive of the path's ana state.
900 		 * If we reach through here then it means that path's ana state
901 		 * is not live but still create the sysfs link to this path from
902 		 * head node if head node of the path has already come alive.
903 		 */
904 		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
905 			nvme_mpath_add_sysfs_link(ns->head);
906 	}
907 }
908 
909 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
910 		struct nvme_ana_group_desc *desc, void *data)
911 {
912 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
913 	unsigned *nr_change_groups = data;
914 	struct nvme_ns *ns;
915 	int srcu_idx;
916 
917 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
918 			le32_to_cpu(desc->grpid),
919 			nvme_ana_state_names[desc->state]);
920 
921 	if (desc->state == NVME_ANA_CHANGE)
922 		(*nr_change_groups)++;
923 
924 	if (!nr_nsids)
925 		return 0;
926 
927 	srcu_idx = srcu_read_lock(&ctrl->srcu);
928 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
929 				 srcu_read_lock_held(&ctrl->srcu)) {
930 		unsigned nsid;
931 again:
932 		nsid = le32_to_cpu(desc->nsids[n]);
933 		if (ns->head->ns_id < nsid)
934 			continue;
935 		if (ns->head->ns_id == nsid)
936 			nvme_update_ns_ana_state(desc, ns);
937 		if (++n == nr_nsids)
938 			break;
939 		if (ns->head->ns_id > nsid)
940 			goto again;
941 	}
942 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
943 	return 0;
944 }
945 
946 static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
947 {
948 	u32 nr_change_groups = 0;
949 	int error;
950 
951 	mutex_lock(&ctrl->ana_lock);
952 	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
953 			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
954 	if (error) {
955 		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
956 		goto out_unlock;
957 	}
958 
959 	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
960 			nvme_update_ana_state);
961 	if (error)
962 		goto out_unlock;
963 
964 	/*
965 	 * In theory we should have an ANATT timer per group as they might enter
966 	 * the change state at different times.  But that is a lot of overhead
967 	 * just to protect against a target that keeps entering new changes
968 	 * states while never finishing previous ones.  But we'll still
969 	 * eventually time out once all groups are in change state, so this
970 	 * isn't a big deal.
971 	 *
972 	 * We also double the ANATT value to provide some slack for transports
973 	 * or AEN processing overhead.
974 	 */
975 	if (nr_change_groups)
976 		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
977 	else
978 		timer_delete_sync(&ctrl->anatt_timer);
979 out_unlock:
980 	mutex_unlock(&ctrl->ana_lock);
981 	return error;
982 }
983 
984 static void nvme_ana_work(struct work_struct *work)
985 {
986 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
987 
988 	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
989 		return;
990 
991 	nvme_read_ana_log(ctrl);
992 }
993 
994 void nvme_mpath_update(struct nvme_ctrl *ctrl)
995 {
996 	u32 nr_change_groups = 0;
997 
998 	if (!ctrl->ana_log_buf)
999 		return;
1000 
1001 	mutex_lock(&ctrl->ana_lock);
1002 	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
1003 	mutex_unlock(&ctrl->ana_lock);
1004 }
1005 
1006 static void nvme_anatt_timeout(struct timer_list *t)
1007 {
1008 	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);
1009 
1010 	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
1011 	nvme_reset_ctrl(ctrl);
1012 }
1013 
1014 void nvme_mpath_stop(struct nvme_ctrl *ctrl)
1015 {
1016 	if (!nvme_ctrl_use_ana(ctrl))
1017 		return;
1018 	timer_delete_sync(&ctrl->anatt_timer);
1019 	cancel_work_sync(&ctrl->ana_work);
1020 }
1021 
/* Define a device attribute named subsys_attr_<_name> via __ATTR(). */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)		\
	struct device_attribute subsys_attr_##_name =		\
		__ATTR(_name, _mode, _show, _store)
1025 
1026 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
1027 		struct device_attribute *attr, char *buf)
1028 {
1029 	struct nvme_subsystem *subsys =
1030 		container_of(dev, struct nvme_subsystem, dev);
1031 
1032 	return sysfs_emit(buf, "%s\n",
1033 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
1034 }
1035 
1036 static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
1037 		int iopolicy)
1038 {
1039 	struct nvme_ctrl *ctrl;
1040 	int old_iopolicy = READ_ONCE(subsys->iopolicy);
1041 
1042 	if (old_iopolicy == iopolicy)
1043 		return;
1044 
1045 	WRITE_ONCE(subsys->iopolicy, iopolicy);
1046 
1047 	/* iopolicy changes clear the mpath by design */
1048 	mutex_lock(&nvme_subsystems_lock);
1049 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1050 		nvme_mpath_clear_ctrl_paths(ctrl);
1051 	mutex_unlock(&nvme_subsystems_lock);
1052 
1053 	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
1054 			subsys->subnqn,
1055 			nvme_iopolicy_names[old_iopolicy],
1056 			nvme_iopolicy_names[iopolicy]);
1057 }
1058 
1059 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
1060 		struct device_attribute *attr, const char *buf, size_t count)
1061 {
1062 	struct nvme_subsystem *subsys =
1063 		container_of(dev, struct nvme_subsystem, dev);
1064 	int policy;
1065 
1066 	policy = nvme_iopolicy_parse(buf);
1067 	if (policy < 0)
1068 		return policy;
1069 
1070 	nvme_subsys_iopolicy_update(subsys, policy);
1071 	return count;
1072 }
1073 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
1074 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
1075 
1076 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
1077 		char *buf)
1078 {
1079 	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
1080 }
1081 DEVICE_ATTR_RO(ana_grpid);
1082 
1083 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
1084 		char *buf)
1085 {
1086 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1087 
1088 	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
1089 }
1090 DEVICE_ATTR_RO(ana_state);
1091 
1092 static ssize_t queue_depth_show(struct device *dev,
1093 		struct device_attribute *attr, char *buf)
1094 {
1095 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1096 
1097 	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
1098 		return 0;
1099 
1100 	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
1101 }
1102 DEVICE_ATTR_RO(queue_depth);
1103 
1104 static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
1105 		char *buf)
1106 {
1107 	int node, srcu_idx;
1108 	nodemask_t numa_nodes;
1109 	struct nvme_ns *current_ns;
1110 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1111 	struct nvme_ns_head *head = ns->head;
1112 
1113 	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
1114 		return 0;
1115 
1116 	nodes_clear(numa_nodes);
1117 
1118 	srcu_idx = srcu_read_lock(&head->srcu);
1119 	for_each_node(node) {
1120 		current_ns = srcu_dereference(head->current_path[node],
1121 				&head->srcu);
1122 		if (ns == current_ns)
1123 			node_set(node, numa_nodes);
1124 	}
1125 	srcu_read_unlock(&head->srcu, srcu_idx);
1126 
1127 	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
1128 }
1129 DEVICE_ATTR_RO(numa_nodes);
1130 
1131 static ssize_t delayed_removal_secs_show(struct device *dev,
1132 		struct device_attribute *attr, char *buf)
1133 {
1134 	struct gendisk *disk = dev_to_disk(dev);
1135 	struct nvme_ns_head *head = disk->private_data;
1136 	int ret;
1137 
1138 	mutex_lock(&head->subsys->lock);
1139 	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
1140 	mutex_unlock(&head->subsys->lock);
1141 	return ret;
1142 }
1143 
1144 static ssize_t delayed_removal_secs_store(struct device *dev,
1145 		struct device_attribute *attr, const char *buf, size_t count)
1146 {
1147 	struct gendisk *disk = dev_to_disk(dev);
1148 	struct nvme_ns_head *head = disk->private_data;
1149 	unsigned int sec;
1150 	int ret;
1151 
1152 	ret = kstrtouint(buf, 0, &sec);
1153 	if (ret < 0)
1154 		return ret;
1155 
1156 	mutex_lock(&head->subsys->lock);
1157 	head->delayed_removal_secs = sec;
1158 	if (sec)
1159 		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1160 	else
1161 		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1162 	mutex_unlock(&head->subsys->lock);
1163 	/*
1164 	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
1165 	 * by its reader.
1166 	 */
1167 	synchronize_srcu(&head->srcu);
1168 
1169 	return count;
1170 }
1171 
1172 DEVICE_ATTR_RW(delayed_removal_secs);
1173 
1174 static ssize_t multipath_failover_count_show(struct device *dev,
1175 		struct device_attribute *attr, char *buf)
1176 {
1177 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1178 
1179 	return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->failover));
1180 }
1181 
1182 static ssize_t multipath_failover_count_store(struct device *dev,
1183 		struct device_attribute *attr, const char *buf, size_t count)
1184 {
1185 	unsigned long failover;
1186 	int ret;
1187 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1188 
1189 	ret = kstrtoul(buf, 0, &failover);
1190 	if (ret)
1191 		return -EINVAL;
1192 
1193 	atomic_long_set(&ns->failover, failover);
1194 
1195 	return count;
1196 }
1197 
1198 DEVICE_ATTR_RW(multipath_failover_count);
1199 
1200 static ssize_t io_requeue_no_usable_path_count_show(struct device *dev,
1201 		struct device_attribute *attr, char *buf)
1202 {
1203 	struct gendisk *disk = dev_to_disk(dev);
1204 	struct nvme_ns_head *head = disk->private_data;
1205 
1206 	return sysfs_emit(buf, "%lu\n",
1207 		    atomic_long_read(&head->io_requeue_no_usable_path_count));
1208 }
1209 
1210 static ssize_t io_requeue_no_usable_path_count_store(struct device *dev,
1211 		struct device_attribute *attr, const char *buf, size_t count)
1212 {
1213 	int err;
1214 	unsigned long requeue_cnt;
1215 	struct gendisk *disk = dev_to_disk(dev);
1216 	struct nvme_ns_head *head = disk->private_data;
1217 
1218 	err = kstrtoul(buf, 0, &requeue_cnt);
1219 	if (err)
1220 		return -EINVAL;
1221 
1222 	atomic_long_set(&head->io_requeue_no_usable_path_count, requeue_cnt);
1223 
1224 	return count;
1225 }
1226 
1227 DEVICE_ATTR_RW(io_requeue_no_usable_path_count);
1228 
1229 static ssize_t io_fail_no_available_path_count_show(struct device *dev,
1230 		struct device_attribute *attr, char *buf)
1231 {
1232 	struct gendisk *disk = dev_to_disk(dev);
1233 	struct nvme_ns_head *head = disk->private_data;
1234 
1235 	return sysfs_emit(buf, "%lu\n",
1236 		    atomic_long_read(&head->io_fail_no_available_path_count));
1237 }
1238 
1239 static ssize_t io_fail_no_available_path_count_store(struct device *dev,
1240 		struct device_attribute *attr, const char *buf, size_t count)
1241 {
1242 	int err;
1243 	unsigned long fail_cnt;
1244 	struct gendisk *disk = dev_to_disk(dev);
1245 	struct nvme_ns_head *head = disk->private_data;
1246 
1247 	err = kstrtoul(buf, 0, &fail_cnt);
1248 	if (err)
1249 		return -EINVAL;
1250 
1251 	atomic_long_set(&head->io_fail_no_available_path_count, fail_cnt);
1252 
1253 	return count;
1254 }
1255 
1256 DEVICE_ATTR_RW(io_fail_no_available_path_count);
1257 
1258 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
1259 		struct nvme_ana_group_desc *desc, void *data)
1260 {
1261 	struct nvme_ana_group_desc *dst = data;
1262 
1263 	if (desc->grpid != dst->grpid)
1264 		return 0;
1265 
1266 	*dst = *desc;
1267 	return -ENXIO; /* just break out of the loop */
1268 }
1269 
1270 void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
1271 {
1272 	struct device *target;
1273 	int rc, srcu_idx;
1274 	struct nvme_ns *ns;
1275 	struct kobject *kobj;
1276 
1277 	/*
1278 	 * Ensure head disk node is already added otherwise we may get invalid
1279 	 * kobj for head disk node
1280 	 */
1281 	if (!test_bit(GD_ADDED, &head->disk->state))
1282 		return;
1283 
1284 	kobj = &disk_to_dev(head->disk)->kobj;
1285 
1286 	/*
1287 	 * loop through each ns chained through the head->list and create the
1288 	 * sysfs link from head node to the ns path node
1289 	 */
1290 	srcu_idx = srcu_read_lock(&head->srcu);
1291 
1292 	list_for_each_entry_srcu(ns, &head->list, siblings,
1293 				 srcu_read_lock_held(&head->srcu)) {
1294 		/*
1295 		 * Ensure that ns path disk node is already added otherwise we
1296 		 * may get invalid kobj name for target
1297 		 */
1298 		if (!test_bit(GD_ADDED, &ns->disk->state))
1299 			continue;
1300 
1301 		/*
1302 		 * Avoid creating link if it already exists for the given path.
1303 		 * When path ana state transitions from optimized to non-
1304 		 * optimized or vice-versa, the nvme_mpath_set_live() is
1305 		 * invoked which in truns call this function. Now if the sysfs
1306 		 * link already exists for the given path and we attempt to re-
1307 		 * create the link then sysfs code would warn about it loudly.
1308 		 * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure
1309 		 * that we're not creating duplicate link.
1310 		 * The test_and_set_bit() is used because it is protecting
1311 		 * against multiple nvme paths being simultaneously added.
1312 		 */
1313 		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1314 			continue;
1315 
1316 		target = disk_to_dev(ns->disk);
1317 		/*
1318 		 * Create sysfs link from head gendisk kobject @kobj to the
1319 		 * ns path gendisk kobject @target->kobj.
1320 		 */
1321 		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
1322 				&target->kobj, dev_name(target));
1323 		if (unlikely(rc)) {
1324 			dev_err(disk_to_dev(ns->head->disk),
1325 					"failed to create link to %s\n",
1326 					dev_name(target));
1327 			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1328 		}
1329 	}
1330 
1331 	srcu_read_unlock(&head->srcu, srcu_idx);
1332 }
1333 
1334 void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
1335 {
1336 	struct device *target;
1337 	struct kobject *kobj;
1338 
1339 	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1340 		return;
1341 
1342 	target = disk_to_dev(ns->disk);
1343 	kobj = &disk_to_dev(ns->head->disk)->kobj;
1344 	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
1345 			dev_name(target));
1346 	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1347 }
1348 
1349 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
1350 {
1351 	if (nvme_ctrl_use_ana(ns->ctrl)) {
1352 		struct nvme_ana_group_desc desc = {
1353 			.grpid = anagrpid,
1354 			.state = 0,
1355 		};
1356 
1357 		mutex_lock(&ns->ctrl->ana_lock);
1358 		ns->ana_grpid = le32_to_cpu(anagrpid);
1359 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
1360 		mutex_unlock(&ns->ctrl->ana_lock);
1361 		if (desc.state) {
1362 			/* found the group desc: update */
1363 			nvme_update_ns_ana_state(&desc, ns);
1364 		} else {
1365 			/* group desc not found: trigger a re-read */
1366 			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
1367 			queue_work(nvme_wq, &ns->ctrl->ana_work);
1368 		}
1369 	} else {
1370 		ns->ana_state = NVME_ANA_OPTIMIZED;
1371 		nvme_mpath_set_live(ns);
1372 	}
1373 
1374 #ifdef CONFIG_BLK_DEV_ZONED
1375 	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
1376 		ns->head->disk->nr_zones = ns->disk->nr_zones;
1377 #endif
1378 }
1379 
1380 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
1381 {
1382 	bool remove = false;
1383 
1384 	if (!head->disk)
1385 		return;
1386 
1387 	mutex_lock(&head->subsys->lock);
1388 	/*
1389 	 * We are called when all paths have been removed, and at that point
1390 	 * head->list is expected to be empty. However, nvme_ns_remove() and
1391 	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
1392 	 * removal_secs is configured, it is possible that by the time we reach
1393 	 * this point, head->list may no longer be empty. Therefore, we recheck
1394 	 * head->list here. If it is no longer empty then we skip enqueuing the
1395 	 * delayed head removal work.
1396 	 */
1397 	if (!list_empty(&head->list))
1398 		goto out;
1399 
1400 	/*
1401 	 * Ensure that no one could remove this module while the head
1402 	 * remove work is pending.
1403 	 */
1404 	if (head->delayed_removal_secs && try_module_get(THIS_MODULE)) {
1405 		mod_delayed_work(nvme_wq, &head->remove_work,
1406 				head->delayed_removal_secs * HZ);
1407 	} else {
1408 		list_del_init(&head->entry);
1409 		remove = true;
1410 	}
1411 out:
1412 	mutex_unlock(&head->subsys->lock);
1413 	if (remove)
1414 		nvme_remove_head(head);
1415 }
1416 
1417 void nvme_mpath_put_disk(struct nvme_ns_head *head)
1418 {
1419 	if (!head->disk)
1420 		return;
1421 	/* make sure all pending bios are cleaned up */
1422 	kblockd_schedule_work(&head->requeue_work);
1423 	flush_work(&head->requeue_work);
1424 	flush_work(&head->partition_scan_work);
1425 	put_disk(head->disk);
1426 }
1427 
1428 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
1429 {
1430 	mutex_init(&ctrl->ana_lock);
1431 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
1432 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
1433 }
1434 
1435 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1436 {
1437 	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
1438 	size_t ana_log_size;
1439 	int error = 0;
1440 
1441 	/* check if multipath is enabled and we have the capability */
1442 	if (!multipath || !ctrl->subsys ||
1443 	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
1444 		return 0;
1445 
1446 	/* initialize this in the identify path to cover controller resets */
1447 	atomic_set(&ctrl->nr_active, 0);
1448 
1449 	if (!ctrl->max_namespaces ||
1450 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
1451 		dev_err(ctrl->device,
1452 			"Invalid MNAN value %u\n", ctrl->max_namespaces);
1453 		return -EINVAL;
1454 	}
1455 
1456 	ctrl->anacap = id->anacap;
1457 	ctrl->anatt = id->anatt;
1458 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
1459 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
1460 
1461 	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
1462 		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
1463 		ctrl->max_namespaces * sizeof(__le32);
1464 	if (ana_log_size > max_transfer_size) {
1465 		dev_err(ctrl->device,
1466 			"ANA log page size (%zd) larger than MDTS (%zd).\n",
1467 			ana_log_size, max_transfer_size);
1468 		dev_err(ctrl->device, "disabling ANA support.\n");
1469 		goto out_uninit;
1470 	}
1471 	if (ana_log_size > ctrl->ana_log_size) {
1472 		nvme_mpath_stop(ctrl);
1473 		nvme_mpath_uninit(ctrl);
1474 		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
1475 		if (!ctrl->ana_log_buf)
1476 			return -ENOMEM;
1477 	}
1478 	ctrl->ana_log_size = ana_log_size;
1479 	error = nvme_read_ana_log(ctrl);
1480 	if (error)
1481 		goto out_uninit;
1482 	return 0;
1483 
1484 out_uninit:
1485 	nvme_mpath_uninit(ctrl);
1486 	return error;
1487 }
1488 
1489 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
1490 {
1491 	kvfree(ctrl->ana_log_buf);
1492 	ctrl->ana_log_buf = NULL;
1493 	ctrl->ana_log_size = 0;
1494 }
1495