xref: /linux/drivers/nvme/host/multipath.c (revision b2c45ced591e6cf947560d2d290a51855926b774)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2018 Christoph Hellwig.
4  */
5 
6 #include <linux/backing-dev.h>
7 #include <linux/moduleparam.h>
8 #include <linux/vmalloc.h>
9 #include <trace/events/block.h>
10 #include "nvme.h"
11 
/*
 * Module-scope knobs, also exposed as module parameters below.
 * "multipath" enables native multipath support; "multipath_always_on"
 * forces creation of a multipath node even for single-ported devices.
 */
bool multipath = true;
static bool multipath_always_on;
14 
15 static int multipath_param_set(const char *val, const struct kernel_param *kp)
16 {
17 	int ret;
18 	bool *arg = kp->arg;
19 
20 	ret = param_set_bool(val, kp);
21 	if (ret)
22 		return ret;
23 
24 	if (multipath_always_on && !*arg) {
25 		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
26 		*arg = true;
27 		return -EINVAL;
28 	}
29 
30 	return 0;
31 }
32 
/* Custom ops so multipath_param_set() can veto disabling multipath. */
static const struct kernel_param_ops multipath_param_ops = {
	.set = multipath_param_set,
	.get = param_get_bool,
};

/* Read-only via sysfs (0444); only settable at load/boot time. */
module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");
41 
42 static int multipath_always_on_set(const char *val,
43 		const struct kernel_param *kp)
44 {
45 	int ret;
46 	bool *arg = kp->arg;
47 
48 	ret = param_set_bool(val, kp);
49 	if (ret < 0)
50 		return ret;
51 
52 	if (*arg)
53 		multipath = true;
54 
55 	return 0;
56 }
57 
/* Custom ops so enabling multipath_always_on also enables multipath. */
static const struct kernel_param_ops multipath_always_on_ops = {
	.set = multipath_always_on_set,
	.get = param_get_bool,
};

/* Read-only via sysfs (0444); only settable at load/boot time. */
module_param_cb(multipath_always_on, &multipath_always_on_ops,
		&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
67 
/* Human-readable names, indexed by the NVME_IOPOLICY_* enum values. */
static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
	[NVME_IOPOLICY_QD]      = "queue-depth",
};

/* Default I/O policy applied to newly created subsystems. */
static int iopolicy = NVME_IOPOLICY_NUMA;
75 
76 static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
77 {
78 	if (!val)
79 		return -EINVAL;
80 	if (!strncmp(val, "numa", 4))
81 		iopolicy = NVME_IOPOLICY_NUMA;
82 	else if (!strncmp(val, "round-robin", 11))
83 		iopolicy = NVME_IOPOLICY_RR;
84 	else if (!strncmp(val, "queue-depth", 11))
85 		iopolicy = NVME_IOPOLICY_QD;
86 	else
87 		return -EINVAL;
88 
89 	return 0;
90 }
91 
/* Getter for the "iopolicy" module parameter: emit the policy name. */
static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

/* Writable at runtime (0644); changes only affect new subsystems. */
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
101 
/* Seed a new subsystem with the module-wide default I/O policy. */
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}
106 
107 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
108 {
109 	struct nvme_ns_head *h;
110 
111 	lockdep_assert_held(&subsys->lock);
112 	list_for_each_entry(h, &subsys->nsheads, entry)
113 		if (h->disk)
114 			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
115 }
116 
117 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
118 {
119 	struct nvme_ns_head *h;
120 
121 	lockdep_assert_held(&subsys->lock);
122 	list_for_each_entry(h, &subsys->nsheads, entry)
123 		if (h->disk)
124 			blk_mq_freeze_queue_wait(h->disk->queue);
125 }
126 
127 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
128 {
129 	struct nvme_ns_head *h;
130 
131 	lockdep_assert_held(&subsys->lock);
132 	list_for_each_entry(h, &subsys->nsheads, entry)
133 		if (h->disk)
134 			blk_freeze_queue_start(h->disk->queue);
135 }
136 
/*
 * Fail over a request that completed with a path error: invalidate the
 * cached current path, move the request's bios to the head's requeue list
 * and complete the request so the bios can be retried on another path.
 */
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	/* Re-target the bios at the multipath node before stealing them. */
	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next)
		bio_set_dev(bio, ns->head->disk->part0);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	/* Complete the request without error; the stolen bios carry the I/O. */
	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}
166 
/*
 * Multipath-level accounting at request start: bump the controller's active
 * count for the queue-depth policy and begin block I/O accounting against
 * the multipath gendisk.
 */
void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	/* Queue-depth policy: count this request once per controller. */
	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
		atomic_inc(&ns->ctrl->nr_active);
		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
	}

	/* Only account file-system I/O, and only once per request. */
	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq) ||
	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
187 
/* Undo the accounting performed in nvme_mpath_start_request(). */
void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
		atomic_dec_if_positive(&ns->ctrl->nr_active);

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}
201 
/*
 * Kick the requeue work of every multipath node reachable through this
 * controller so queued bios are retried, and send a change uevent when the
 * controller is live.  Namespace list is walked under SRCU.
 */
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
218 
/* Printable names indexed by enum nvme_ana_state; 0 is not a valid state. */
static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};
227 
228 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
229 {
230 	struct nvme_ns_head *head = ns->head;
231 	bool changed = false;
232 	int node;
233 
234 	if (!head)
235 		goto out;
236 
237 	for_each_node(node) {
238 		if (ns == rcu_access_pointer(head->current_path[node])) {
239 			rcu_assign_pointer(head->current_path[node], NULL);
240 			changed = true;
241 		}
242 	}
243 out:
244 	return changed;
245 }
246 
247 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
248 {
249 	struct nvme_ns *ns;
250 	int srcu_idx;
251 
252 	srcu_idx = srcu_read_lock(&ctrl->srcu);
253 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
254 				 srcu_read_lock_held(&ctrl->srcu)) {
255 		nvme_mpath_clear_current_path(ns);
256 		kblockd_schedule_work(&ns->head->requeue_work);
257 	}
258 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
259 }
260 
/*
 * A path's size changed: mark every sibling whose capacity disagrees with
 * the multipath node as not ready, drop all cached path selections and let
 * the requeue work re-run path selection.
 */
void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	/* NOTE: "ns" is reused below as the iterator over all sibling paths. */
	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}
280 
281 static bool nvme_path_is_disabled(struct nvme_ns *ns)
282 {
283 	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
284 
285 	/*
286 	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
287 	 * still be able to complete assuming that the controller is connected.
288 	 * Otherwise it will fail immediately and return to the requeue list.
289 	 */
290 	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
291 		return true;
292 	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
293 	    !test_bit(NVME_NS_READY, &ns->flags))
294 		return true;
295 	return false;
296 }
297 
/*
 * Full path search for @node: prefer the nearest ANA-optimized path, fall
 * back to the nearest non-optimized one, and cache the winner in
 * head->current_path[node].  Caller must hold the head's SRCU read lock.
 */
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		/* NUMA distance only matters for the numa iopolicy. */
		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			/* inaccessible/change/persistent-loss: not usable */
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}
338 
339 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
340 		struct nvme_ns *ns)
341 {
342 	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
343 			siblings);
344 	if (ns)
345 		return ns;
346 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
347 }
348 
/*
 * Round-robin path selection: starting after the currently cached path,
 * pick the next usable optimized path (or remember a non-optimized one as
 * fallback), then update the per-node cache.  Runs under the head's SRCU
 * read lock.
 */
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *found = NULL;
	int node = numa_node_id();
	struct nvme_ns *old = srcu_dereference(head->current_path[node],
					       &head->srcu);

	/* No cached path yet: do a full search. */
	if (unlikely(!old))
		return __nvme_find_path(head, node);

	/* Single path: nothing to rotate through. */
	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}
396 
/*
 * Queue-depth path selection: pick the usable path whose controller has the
 * fewest in-flight multipath requests, preferring ANA-optimized paths.
 * Runs under the head's SRCU read lock.
 */
static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
{
	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
	unsigned int depth;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		depth = atomic_read(&ns->ctrl->nr_active);

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (depth < min_depth_opt) {
				min_depth_opt = depth;
				best_opt = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (depth < min_depth_nonopt) {
				min_depth_nonopt = depth;
				best_nonopt = ns;
			}
			break;
		default:
			break;
		}

		/* An idle optimized path cannot be beaten; stop early. */
		if (min_depth_opt == 0)
			return best_opt;
	}

	return best_opt ? best_opt : best_nonopt;
}
433 
434 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
435 {
436 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
437 		ns->ana_state == NVME_ANA_OPTIMIZED;
438 }
439 
440 static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
441 {
442 	int node = numa_node_id();
443 	struct nvme_ns *ns;
444 
445 	ns = srcu_dereference(head->current_path[node], &head->srcu);
446 	if (unlikely(!ns))
447 		return __nvme_find_path(head, node);
448 	if (unlikely(!nvme_path_is_optimized(ns)))
449 		return __nvme_find_path(head, node);
450 	return ns;
451 }
452 
453 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
454 {
455 	switch (READ_ONCE(head->subsys->iopolicy)) {
456 	case NVME_IOPOLICY_QD:
457 		return nvme_queue_depth_path(head);
458 	case NVME_IOPOLICY_RR:
459 		return nvme_round_robin_path(head);
460 	default:
461 		return nvme_numa_path(head);
462 	}
463 }
464 
/*
 * Decide whether I/O should be requeued (true) or failed (false) when no
 * path is currently usable: any controller that is live or may come back
 * counts as an available path.
 */
static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		return false;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (nvme_ctrl_state(ns->ctrl)) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			return true;
		default:
			break;
		}
	}

	/*
	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
	 * not immediately fail I/O. Instead, requeue the I/O for the configured
	 * duration, anticipating that if there's a transient link failure then
	 * it may recover within this time window. This parameter is exported to
	 * userspace via sysfs, and its default value is zero. It is internally
	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
	 * non-zero, this flag is set to true. When zero, the flag is cleared.
	 */
	return nvme_mpath_queue_if_no_path(head);
}
497 
/*
 * Entry point for bios submitted against the multipath gendisk: pick a path
 * under SRCU and redirect the bio there, or requeue/fail when no path is
 * usable.
 */
static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		/* Retarget the bio at the chosen path and resubmit it. */
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		/* No usable path right now, but one may come back: requeue. */
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		/* No path will ever come back: fail the I/O. */
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}
536 
537 static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
538 {
539 	if (!nvme_tryget_ns_head(disk->private_data))
540 		return -ENXIO;
541 	return 0;
542 }
543 
/* Block-device release: drop the reference taken at open time. */
static void nvme_ns_head_release(struct gendisk *disk)
{
	nvme_put_ns_head(disk->private_data);
}
548 
549 static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
550 		enum blk_unique_id type)
551 {
552 	struct nvme_ns_head *head = disk->private_data;
553 	struct nvme_ns *ns;
554 	int srcu_idx, ret = -EWOULDBLOCK;
555 
556 	srcu_idx = srcu_read_lock(&head->srcu);
557 	ns = nvme_find_path(head);
558 	if (ns)
559 		ret = nvme_ns_get_unique_id(ns, id, type);
560 	srcu_read_unlock(&head->srcu, srcu_idx);
561 	return ret;
562 }
563 
#ifdef CONFIG_BLK_DEV_ZONED
/* Forward a zone report to whichever path is currently usable. */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, args);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
582 
/* Block-device operations for the multipath (head) gendisk. */
const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.get_unique_id	= nvme_ns_head_get_unique_id,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};
595 
/* Map the embedded cdev back to its containing nvme_ns_head. */
static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}
600 
601 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
602 {
603 	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
604 		return -ENXIO;
605 	return 0;
606 }
607 
/* Char-device release: drop the reference taken at open time. */
static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}
613 
/* File operations for the ng<subsys>n<ns> multipath char device. */
static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};
623 
624 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
625 {
626 	int ret;
627 
628 	head->cdev_device.parent = &head->subsys->dev;
629 	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
630 			   head->subsys->instance, head->instance);
631 	if (ret)
632 		return ret;
633 	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
634 			    &nvme_ns_head_chr_fops, THIS_MODULE);
635 	return ret;
636 }
637 
/*
 * Deferred partition scan for the multipath gendisk; runs outside of
 * scan_work to avoid the deadlock described in nvme_mpath_alloc_disk().
 */
static void nvme_partition_scan_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, partition_scan_work);

	/* The suppress bit must still be set; it is only cleared here. */
	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
					     &head->disk->state)))
		return;

	mutex_lock(&head->disk->open_mutex);
	bdev_disk_changed(head->disk, false);
	mutex_unlock(&head->disk->open_mutex);
}
651 
/*
 * Resubmit all bios parked on the head's requeue list so path selection
 * runs again for each of them.
 */
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	/* Detach the whole list under the lock, then submit lock-free. */
	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}
669 
/*
 * Tear down the multipath node: flush queued I/O (which now fails because
 * NVME_NSHEAD_DISK_LIVE is clear), remove the char device and gendisk, and
 * drop the final reference.
 */
static void nvme_remove_head(struct nvme_ns_head *head)
{
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		/*
		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
		 * to allow multipath to fail all I/O.
		 */
		kblockd_schedule_work(&head->requeue_work);

		nvme_cdev_del(&head->cdev, &head->cdev_device);
		/* Wait for in-flight SRCU readers before killing the disk. */
		synchronize_srcu(&head->srcu);
		del_gendisk(head->disk);
	}
	nvme_put_ns_head(head);
}
685 
/*
 * Delayed-work handler for removing a pathless head: only remove it if no
 * path re-appeared while the removal was pending.  Drops the module
 * reference taken when the work was queued.
 */
static void nvme_remove_head_work(struct work_struct *work)
{
	struct nvme_ns_head *head = container_of(to_delayed_work(work),
			struct nvme_ns_head, remove_work);
	bool remove = false;

	mutex_lock(&head->subsys->lock);
	if (list_empty(&head->list)) {
		list_del_init(&head->entry);
		remove = true;
	}
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);

	module_put(THIS_MODULE);
}
703 
/*
 * Initialize the multipath state of @head and, when multipath applies,
 * allocate the multipath gendisk for it.  Returns 0 on success (including
 * the no-multipath case where head->disk stays NULL) or a negative errno.
 */
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
	head->delayed_removal_secs = 0;

	/*
	 * If "multipath_always_on" is enabled, a multipath node is added
	 * regardless of whether the disk is single/multi ported, and whether
	 * the namespace is shared or private. If "multipath_always_on" is not
	 * enabled, a multipath node is added only if the subsystem supports
	 * multiple controllers and the "multipath" option is configured. In
	 * either case, for private namespaces, we ensure that the NSID is
	 * unique.
	 */
	if (!multipath_always_on) {
		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
				!multipath)
			return 0;
	}

	if (!nvme_is_unique_nsid(ctrl, head))
		return 0;

	blk_set_stacking_limits(&lim);
	lim.dma_alignment = 3;
	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
		BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
	if (head->ids.csi == NVME_CSI_ZNS)
		lim.features |= BLK_FEAT_ZONED;

	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
	if (IS_ERR(head->disk))
		return PTR_ERR(head->disk);
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;

	/*
	 * We need to suppress the partition scan from occurring within the
	 * controller's scan_work context. If a path error occurs here, the IO
	 * will wait until a path becomes available or all paths are torn down,
	 * but that action also occurs within scan_work, so it would deadlock.
	 * Defer the partition scan to a different context that does not block
	 * scan_work.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	nvme_tryget_ns_head(head);
	return 0;
}
761 
/*
 * A path transitioned to a live ANA state: publish the multipath gendisk
 * (first time only), add the sysfs link for this path, warm the per-node
 * path cache when the path is optimized, and requeue pending I/O.
 */
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
		/* Partition scan must run outside scan_work; see alloc_disk. */
		queue_work(nvme_wq, &head->partition_scan_work);
	}

	nvme_mpath_add_sysfs_link(ns->head);

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		/* Pre-populate the current_path cache for every online node. */
		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_online_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}
802 
/*
 * Walk the ANA log page in ctrl->ana_log_buf and invoke @cb for every group
 * descriptor, validating offsets and descriptor fields along the way.
 * Caller must hold ctrl->ana_lock.  Returns 0, the first error from @cb,
 * or -EINVAL on a malformed log page.
 */
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		/* Bounds-check the fixed part of the descriptor. */
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		/* Sanity-check group id and ANA state fields. */
		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		/* Bounds-check the trailing NSID array as well. */
		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}
846 
847 static inline bool nvme_state_is_live(enum nvme_ana_state state)
848 {
849 	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
850 }
851 
/*
 * Apply an ANA group descriptor to @ns: record group id and state, and
 * publish the path (or just its sysfs link) as appropriate.
 */
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
	else {
		/*
		 * Add sysfs link from multipath head gendisk node to path
		 * device gendisk node.
		 * If path's ana state is live (i.e. state is either optimized
		 * or non-optimized) while we alloc the ns then sysfs link would
		 * be created from nvme_mpath_set_live(). In that case we would
		 * not fall through this code path. However for the path's ana
		 * state other than live, we call nvme_mpath_set_live() only
		 * after ana state transitioned to the live state. But we still
		 * want to create the sysfs link from head node to a path device
		 * irrespective of the path's ana state.
		 * If we reach through here then it means that path's ana state
		 * is not live but still create the sysfs link to this path from
		 * head node if head node of the path has already come alive.
		 */
		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
			nvme_mpath_add_sysfs_link(ns->head);
	}
}
890 
/*
 * nvme_parse_ana_log() callback: merge one ANA group descriptor into the
 * controller's namespaces.  Both desc->nsids and ctrl->namespaces are
 * sorted by NSID, so a single merge-style pass suffices.
 */
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;
	int srcu_idx;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	/* Count groups in the CHANGE state so the caller can arm the ANATT timer. */
	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		/* Descriptor NSID ahead of this namespace: advance the list. */
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		/* Namespace ahead of the descriptor NSID: advance the NSID array. */
		if (ns->head->ns_id > nsid)
			goto again;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return 0;
}
927 
/*
 * Fetch the ANA log page from the controller, apply it to all namespaces,
 * and (re)arm or cancel the ANATT timer depending on whether any group is
 * still in the CHANGE state.
 */
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new changes
	 * states while never finishing previous ones.  But we'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		timer_delete_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}
965 
966 static void nvme_ana_work(struct work_struct *work)
967 {
968 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
969 
970 	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
971 		return;
972 
973 	nvme_read_ana_log(ctrl);
974 }
975 
/*
 * Reprocess the already-fetched ANA log once the controller is ready; used
 * for namespaces whose live transition was deferred during initialization
 * (see nvme_update_ns_ana_state()).
 */
void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	/* No ANA support (or log not yet allocated): nothing to reprocess. */
	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}
987 
/* ANATT expired with groups still in the CHANGE state: reset the controller. */
static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}
995 
/* Stop ANA processing for @ctrl: kill the ANATT timer and the ANA work. */
void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	timer_delete_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}
1003 
/* Declare a read/write subsystem-level sysfs attribute named subsys_attr_<name>. */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)
1007 
1008 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
1009 		struct device_attribute *attr, char *buf)
1010 {
1011 	struct nvme_subsystem *subsys =
1012 		container_of(dev, struct nvme_subsystem, dev);
1013 
1014 	return sysfs_emit(buf, "%s\n",
1015 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
1016 }
1017 
/*
 * Switch the subsystem to a new I/O policy and drop every controller's
 * cached current path so selection restarts under the new policy.
 */
static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
		int iopolicy)
{
	struct nvme_ctrl *ctrl;
	int old_iopolicy = READ_ONCE(subsys->iopolicy);

	if (old_iopolicy == iopolicy)
		return;

	WRITE_ONCE(subsys->iopolicy, iopolicy);

	/* iopolicy changes clear the mpath by design */
	mutex_lock(&nvme_subsystems_lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		nvme_mpath_clear_ctrl_paths(ctrl);
	mutex_unlock(&nvme_subsystems_lock);

	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
			subsys->subnqn,
			nvme_iopolicy_names[old_iopolicy],
			nvme_iopolicy_names[iopolicy]);
}
1040 
1041 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
1042 		struct device_attribute *attr, const char *buf, size_t count)
1043 {
1044 	struct nvme_subsystem *subsys =
1045 		container_of(dev, struct nvme_subsystem, dev);
1046 	int i;
1047 
1048 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
1049 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
1050 			nvme_subsys_iopolicy_update(subsys, i);
1051 			return count;
1052 		}
1053 	}
1054 
1055 	return -EINVAL;
1056 }
1057 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
1058 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
1059 
1060 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
1061 		char *buf)
1062 {
1063 	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
1064 }
1065 DEVICE_ATTR_RO(ana_grpid);
1066 
1067 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
1068 		char *buf)
1069 {
1070 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1071 
1072 	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
1073 }
1074 DEVICE_ATTR_RO(ana_state);
1075 
1076 static ssize_t queue_depth_show(struct device *dev,
1077 		struct device_attribute *attr, char *buf)
1078 {
1079 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1080 
1081 	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
1082 		return 0;
1083 
1084 	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
1085 }
1086 DEVICE_ATTR_RO(queue_depth);
1087 
/*
 * sysfs: show the set of NUMA nodes for which this path is currently the
 * cached active path. Only emitted under the "numa" iopolicy; other
 * policies produce an empty read.
 */
static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	int node, srcu_idx;
	nodemask_t numa_nodes;
	struct nvme_ns *current_ns;
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvme_ns_head *head = ns->head;

	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
		return 0;

	nodes_clear(numa_nodes);

	/* the per-node current_path pointers are protected by head->srcu */
	srcu_idx = srcu_read_lock(&head->srcu);
	for_each_node(node) {
		current_ns = srcu_dereference(head->current_path[node],
				&head->srcu);
		/* record every node whose cached path is this namespace */
		if (ns == current_ns)
			node_set(node, numa_nodes);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	/* print as a compact node-list, e.g. "0-1,4" */
	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
}
DEVICE_ATTR_RO(numa_nodes);
1114 
1115 static ssize_t delayed_removal_secs_show(struct device *dev,
1116 		struct device_attribute *attr, char *buf)
1117 {
1118 	struct gendisk *disk = dev_to_disk(dev);
1119 	struct nvme_ns_head *head = disk->private_data;
1120 	int ret;
1121 
1122 	mutex_lock(&head->subsys->lock);
1123 	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
1124 	mutex_unlock(&head->subsys->lock);
1125 	return ret;
1126 }
1127 
1128 static ssize_t delayed_removal_secs_store(struct device *dev,
1129 		struct device_attribute *attr, const char *buf, size_t count)
1130 {
1131 	struct gendisk *disk = dev_to_disk(dev);
1132 	struct nvme_ns_head *head = disk->private_data;
1133 	unsigned int sec;
1134 	int ret;
1135 
1136 	ret = kstrtouint(buf, 0, &sec);
1137 	if (ret < 0)
1138 		return ret;
1139 
1140 	mutex_lock(&head->subsys->lock);
1141 	head->delayed_removal_secs = sec;
1142 	if (sec)
1143 		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1144 	else
1145 		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1146 	mutex_unlock(&head->subsys->lock);
1147 	/*
1148 	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
1149 	 * by its reader.
1150 	 */
1151 	synchronize_srcu(&head->srcu);
1152 
1153 	return count;
1154 }
1155 
1156 DEVICE_ATTR_RW(delayed_removal_secs);
1157 
1158 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
1159 		struct nvme_ana_group_desc *desc, void *data)
1160 {
1161 	struct nvme_ana_group_desc *dst = data;
1162 
1163 	if (desc->grpid != dst->grpid)
1164 		return 0;
1165 
1166 	*dst = *desc;
1167 	return -ENXIO; /* just break out of the loop */
1168 }
1169 
/*
 * Create sysfs links from the multipath head disk to each of its path
 * disks. Safe to call repeatedly (e.g. on ANA state transitions): paths
 * that are already linked are skipped via NVME_NS_SYSFS_ATTR_LINK.
 */
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
	struct device *target;
	int rc, srcu_idx;
	struct nvme_ns *ns;
	struct kobject *kobj;

	/*
	 * Ensure head disk node is already added otherwise we may get invalid
	 * kobj for head disk node
	 */
	if (!test_bit(GD_ADDED, &head->disk->state))
		return;

	kobj = &disk_to_dev(head->disk)->kobj;

	/*
	 * loop through each ns chained through the head->list and create the
	 * sysfs link from head node to the ns path node
	 */
	srcu_idx = srcu_read_lock(&head->srcu);

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		/*
		 * Ensure that ns path disk node is already added otherwise we
		 * may get invalid kobj name for target
		 */
		if (!test_bit(GD_ADDED, &ns->disk->state))
			continue;

		/*
		 * Avoid creating link if it already exists for the given path.
		 * When path ana state transitions from optimized to non-
		 * optimized or vice-versa, the nvme_mpath_set_live() is
		 * invoked which in turn calls this function. Now if the sysfs
		 * link already exists for the given path and we attempt to re-
		 * create the link then sysfs code would warn about it loudly.
		 * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure
		 * that we're not creating duplicate link.
		 * The test_and_set_bit() is used because it is protecting
		 * against multiple nvme paths being simultaneously added.
		 */
		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
			continue;

		target = disk_to_dev(ns->disk);
		/*
		 * Create sysfs link from head gendisk kobject @kobj to the
		 * ns path gendisk kobject @target->kobj.
		 */
		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
				&target->kobj, dev_name(target));
		if (unlikely(rc)) {
			/* roll the flag back so a later retry can re-create it */
			dev_err(disk_to_dev(ns->head->disk),
					"failed to create link to %s\n",
					dev_name(target));
			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
		}
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}
1233 
1234 void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
1235 {
1236 	struct device *target;
1237 	struct kobject *kobj;
1238 
1239 	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1240 		return;
1241 
1242 	target = disk_to_dev(ns->disk);
1243 	kobj = &disk_to_dev(ns->head->disk)->kobj;
1244 	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
1245 			dev_name(target));
1246 	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1247 }
1248 
/*
 * Wire up the ANA state for a newly scanned namespace path. When the
 * controller uses ANA, look up this namespace's group descriptor in the
 * cached ANA log (re-reading the log if the group is not found yet);
 * otherwise the path is unconditionally optimized and made live.
 */
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		/* grpid is matched against by nvme_lookup_ana_group_desc() */
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

#ifdef CONFIG_BLK_DEV_ZONED
	/* the multipath node mirrors the zone count of the path device */
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}
1279 
/*
 * Tear down (or schedule deferred teardown of) the multipath head once
 * its last path is removed. With delayed_removal_secs configured the
 * head lingers for that long so reconnecting paths can reattach to it.
 */
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	bool remove = false;

	if (!head->disk)
		return;

	mutex_lock(&head->subsys->lock);
	/*
	 * We are called when all paths have been removed, and at that point
	 * head->list is expected to be empty. However, nvme_ns_remove() and
	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
	 * removal_secs is configured, it is possible that by the time we reach
	 * this point, head->list may no longer be empty. Therefore, we recheck
	 * head->list here. If it is no longer empty then we skip enqueuing the
	 * delayed head removal work.
	 */
	if (!list_empty(&head->list))
		goto out;

	/*
	 * Ensure that no one could remove this module while the head
	 * remove work is pending.
	 */
	if (head->delayed_removal_secs && try_module_get(THIS_MODULE)) {
		mod_delayed_work(nvme_wq, &head->remove_work,
				head->delayed_removal_secs * HZ);
	} else {
		/* no delay configured (or module unloading): remove now */
		list_del_init(&head->entry);
		remove = true;
	}
out:
	mutex_unlock(&head->subsys->lock);
	/* finish the immediate removal outside of subsys->lock */
	if (remove)
		nvme_remove_head(head);
}
1316 
/*
 * Drop the reference on the multipath gendisk; first kick and flush the
 * requeue work so no bios remain queued, then flush partition scanning.
 */
void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	flush_work(&head->partition_scan_work);
	put_disk(head->disk);
}
1327 
1328 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
1329 {
1330 	mutex_init(&ctrl->ana_lock);
1331 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
1332 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
1333 }
1334 
/*
 * (Re)initialize ANA support from Identify Controller data. Runs on every
 * controller (re)connect. Returns 0 when ANA is disabled/unsupported or
 * successfully set up, a negative errno on fatal errors.
 */
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	/* initialize this in the identify path to cover controller resets */
	atomic_set(&ctrl->nr_active, 0);

	/* MNAN must be non-zero and no larger than the namespace count (NN) */
	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	/* worst case: header + all group descriptors + one NSID per namespace */
	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	/* grow the log buffer only when needed, e.g. after a reset */
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}
1388 
/* Free the ANA log buffer and reset the cached size for reallocation. */
void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}
1395