xref: /linux/drivers/nvme/host/multipath.c (revision 7d435caacd91d23ebba281c4aac859196e1e2938)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2018 Christoph Hellwig.
4  */
5 
6 #include <linux/backing-dev.h>
7 #include <linux/moduleparam.h>
8 #include <linux/vmalloc.h>
9 #include <trace/events/block.h>
10 #include "nvme.h"
11 
12 bool multipath = true;
13 static bool multipath_always_on;
14 
15 static int multipath_param_set(const char *val, const struct kernel_param *kp)
16 {
17 	int ret;
18 	bool *arg = kp->arg;
19 
20 	ret = param_set_bool(val, kp);
21 	if (ret)
22 		return ret;
23 
24 	if (multipath_always_on && !*arg) {
25 		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
26 		*arg = true;
27 		return -EINVAL;
28 	}
29 
30 	return 0;
31 }
32 
33 static const struct kernel_param_ops multipath_param_ops = {
34 	.set = multipath_param_set,
35 	.get = param_get_bool,
36 };
37 
38 module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
39 MODULE_PARM_DESC(multipath,
40 	"turn on native support for multiple controllers per subsystem");
41 
42 static int multipath_always_on_set(const char *val,
43 		const struct kernel_param *kp)
44 {
45 	int ret;
46 	bool *arg = kp->arg;
47 
48 	ret = param_set_bool(val, kp);
49 	if (ret < 0)
50 		return ret;
51 
52 	if (*arg)
53 		multipath = true;
54 
55 	return 0;
56 }
57 
58 static const struct kernel_param_ops multipath_always_on_ops = {
59 	.set = multipath_always_on_set,
60 	.get = param_get_bool,
61 };
62 
63 module_param_cb(multipath_always_on, &multipath_always_on_ops,
64 		&multipath_always_on, 0444);
65 MODULE_PARM_DESC(multipath_always_on,
66 	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
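
/*
 * Usage sketch (not part of the driver): both knobs are module parameters
 * of nvme_core, so they are set at load time or on the kernel command
 * line; the exact invocation below is illustrative:
 *
 *	modprobe nvme_core multipath_always_on=Y	# implies multipath=Y
 *
 * or, with nvme_core built in, "nvme_core.multipath_always_on=Y" on the
 * kernel command line.  Both parameters use mode 0444, so they cannot be
 * changed through sysfs once the module is loaded.
 */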
67 
68 static const char *nvme_iopolicy_names[] = {
69 	[NVME_IOPOLICY_NUMA]	= "numa",
70 	[NVME_IOPOLICY_RR]	= "round-robin",
71 	[NVME_IOPOLICY_QD]      = "queue-depth",
72 };
73 
74 static int iopolicy = NVME_IOPOLICY_NUMA;
75 
76 static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
77 {
78 	if (!val)
79 		return -EINVAL;
80 	if (!strncmp(val, "numa", 4))
81 		iopolicy = NVME_IOPOLICY_NUMA;
82 	else if (!strncmp(val, "round-robin", 11))
83 		iopolicy = NVME_IOPOLICY_RR;
84 	else if (!strncmp(val, "queue-depth", 11))
85 		iopolicy = NVME_IOPOLICY_QD;
86 	else
87 		return -EINVAL;
88 
89 	return 0;
90 }
91 
92 static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
93 {
94 	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
95 }
96 
97 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
98 	&iopolicy, 0644);
99 MODULE_PARM_DESC(iopolicy,
100 	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
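
/*
 * Interaction sketch (paths are illustrative; nvme-subsys0 is an example
 * subsystem instance):
 *
 *	# default policy applied to subsystems created from now on
 *	echo "queue-depth" > /sys/module/nvme_core/parameters/iopolicy
 *
 *	# override the policy of an existing subsystem at runtime
 *	echo "round-robin" > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
 *
 * The module parameter only seeds subsys->iopolicy through
 * nvme_mpath_default_iopolicy() below; changing it later does not affect
 * subsystems that already exist.
 */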
101 
102 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
103 {
104 	subsys->iopolicy = iopolicy;
105 }
106 
107 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
108 {
109 	struct nvme_ns_head *h;
110 
111 	lockdep_assert_held(&subsys->lock);
112 	list_for_each_entry(h, &subsys->nsheads, entry)
113 		if (h->disk)
114 			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
115 }
116 
117 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
118 {
119 	struct nvme_ns_head *h;
120 
121 	lockdep_assert_held(&subsys->lock);
122 	list_for_each_entry(h, &subsys->nsheads, entry)
123 		if (h->disk)
124 			blk_mq_freeze_queue_wait(h->disk->queue);
125 }
126 
127 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
128 {
129 	struct nvme_ns_head *h;
130 
131 	lockdep_assert_held(&subsys->lock);
132 	list_for_each_entry(h, &subsys->nsheads, entry)
133 		if (h->disk)
134 			blk_freeze_queue_start(h->disk->queue);
135 }
136 
137 void nvme_failover_req(struct request *req)
138 {
139 	struct nvme_ns *ns = req->q->queuedata;
140 	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
141 	unsigned long flags;
142 	struct bio *bio;
143 
144 	nvme_mpath_clear_current_path(ns);
145 
146 	/*
147 	 * If we got back an ANA error, we know the controller is alive but not
148 	 * ready to serve this namespace.  Kick off a re-read of the ANA
149 	 * information page, and just try any other available path for now.
150 	 */
151 	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
152 		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
153 		queue_work(nvme_wq, &ns->ctrl->ana_work);
154 	}
155 
156 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
157 	for (bio = req->bio; bio; bio = bio->bi_next)
158 		bio_set_dev(bio, ns->head->disk->part0);
159 	blk_steal_bios(&ns->head->requeue_list, req);
160 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
161 
162 	nvme_req(req)->status = 0;
163 	nvme_end_req(req);
164 	kblockd_schedule_work(&ns->head->requeue_work);
165 }
166 
167 void nvme_mpath_start_request(struct request *rq)
168 {
169 	struct nvme_ns *ns = rq->q->queuedata;
170 	struct gendisk *disk = ns->head->disk;
171 
172 	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
173 	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
174 		atomic_inc(&ns->ctrl->nr_active);
175 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
176 	}
177 
178 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq) ||
179 	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
180 		return;
181 
182 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
183 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
184 						      jiffies);
185 }
186 EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
187 
188 void nvme_mpath_end_request(struct request *rq)
189 {
190 	struct nvme_ns *ns = rq->q->queuedata;
191 
192 	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
193 		atomic_dec_if_positive(&ns->ctrl->nr_active);
194 
195 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
196 		return;
197 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
198 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
199 			 nvme_req(rq)->start_time);
200 }
201 
202 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
203 {
204 	struct nvme_ns *ns;
205 	int srcu_idx;
206 
207 	srcu_idx = srcu_read_lock(&ctrl->srcu);
208 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
209 				 srcu_read_lock_held(&ctrl->srcu)) {
210 		if (!ns->head->disk)
211 			continue;
212 		kblockd_schedule_work(&ns->head->requeue_work);
213 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
214 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
215 	}
216 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
217 }
218 
219 static const char *nvme_ana_state_names[] = {
220 	[0]				= "invalid state",
221 	[NVME_ANA_OPTIMIZED]		= "optimized",
222 	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
223 	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
224 	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
225 	[NVME_ANA_CHANGE]		= "change",
226 };
227 
228 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
229 {
230 	struct nvme_ns_head *head = ns->head;
231 	bool changed = false;
232 	int node;
233 
234 	for_each_node(node) {
235 		if (ns == rcu_access_pointer(head->current_path[node])) {
236 			rcu_assign_pointer(head->current_path[node], NULL);
237 			changed = true;
238 		}
239 	}
240 	return changed;
241 }
242 
243 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
244 {
245 	struct nvme_ns *ns;
246 	int srcu_idx;
247 
248 	srcu_idx = srcu_read_lock(&ctrl->srcu);
249 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
250 				 srcu_read_lock_held(&ctrl->srcu)) {
251 		nvme_mpath_clear_current_path(ns);
252 		kblockd_schedule_work(&ns->head->requeue_work);
253 	}
254 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
255 }
256 
257 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
258 {
259 	struct nvme_ns_head *head = ns->head;
260 	sector_t capacity = get_capacity(head->disk);
261 	int node;
262 	int srcu_idx;
263 
264 	srcu_idx = srcu_read_lock(&head->srcu);
265 	list_for_each_entry_srcu(ns, &head->list, siblings,
266 				 srcu_read_lock_held(&head->srcu)) {
267 		if (capacity != get_capacity(ns->disk))
268 			clear_bit(NVME_NS_READY, &ns->flags);
269 	}
270 	srcu_read_unlock(&head->srcu, srcu_idx);
271 
272 	for_each_node(node)
273 		rcu_assign_pointer(head->current_path[node], NULL);
274 	kblockd_schedule_work(&head->requeue_work);
275 }
276 
277 static bool nvme_path_is_disabled(struct nvme_ns *ns)
278 {
279 	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
280 
281 	/*
282 	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
283 	 * still be able to complete assuming that the controller is connected.
284 	 * Otherwise it will fail immediately and return to the requeue list.
285 	 */
286 	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
287 		return true;
288 	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
289 	    !test_bit(NVME_NS_READY, &ns->flags))
290 		return true;
291 	return false;
292 }
293 
294 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
295 {
296 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
297 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
298 
299 	list_for_each_entry_srcu(ns, &head->list, siblings,
300 				 srcu_read_lock_held(&head->srcu)) {
301 		if (nvme_path_is_disabled(ns))
302 			continue;
303 
304 		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
305 		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
306 			distance = node_distance(node, ns->ctrl->numa_node);
307 		else
308 			distance = LOCAL_DISTANCE;
309 
310 		switch (ns->ana_state) {
311 		case NVME_ANA_OPTIMIZED:
312 			if (distance < found_distance) {
313 				found_distance = distance;
314 				found = ns;
315 			}
316 			break;
317 		case NVME_ANA_NONOPTIMIZED:
318 			if (distance < fallback_distance) {
319 				fallback_distance = distance;
320 				fallback = ns;
321 			}
322 			break;
323 		default:
324 			break;
325 		}
326 	}
327 
328 	if (!found)
329 		found = fallback;
330 	if (found)
331 		rcu_assign_pointer(head->current_path[node], found);
332 	return found;
333 }
334 
335 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
336 		struct nvme_ns *ns)
337 {
338 	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
339 			siblings);
340 	if (ns)
341 		return ns;
342 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
343 }
344 
345 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
346 {
347 	struct nvme_ns *ns, *found = NULL;
348 	int node = numa_node_id();
349 	struct nvme_ns *old = srcu_dereference(head->current_path[node],
350 					       &head->srcu);
351 
352 	if (unlikely(!old))
353 		return __nvme_find_path(head, node);
354 
355 	if (list_is_singular(&head->list)) {
356 		if (nvme_path_is_disabled(old))
357 			return NULL;
358 		return old;
359 	}
360 
361 	for (ns = nvme_next_ns(head, old);
362 	     ns && ns != old;
363 	     ns = nvme_next_ns(head, ns)) {
364 		if (nvme_path_is_disabled(ns))
365 			continue;
366 
367 		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
368 			found = ns;
369 			goto out;
370 		}
371 		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
372 			found = ns;
373 	}
374 
375 	/*
376 	 * The loop above skips the current path for round-robin semantics.
377 	 * Fall back to the current path if either:
378 	 *  - no other optimized path found and current is optimized,
379 	 *  - no other usable path found and current is usable.
380 	 */
381 	if (!nvme_path_is_disabled(old) &&
382 	    (old->ana_state == NVME_ANA_OPTIMIZED ||
383 	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
384 		return old;
385 
386 	if (!found)
387 		return NULL;
388 out:
389 	rcu_assign_pointer(head->current_path[node], found);
390 	return found;
391 }
392 
393 static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
394 {
395 	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
396 	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
397 	unsigned int depth;
398 
399 	list_for_each_entry_srcu(ns, &head->list, siblings,
400 				 srcu_read_lock_held(&head->srcu)) {
401 		if (nvme_path_is_disabled(ns))
402 			continue;
403 
404 		depth = atomic_read(&ns->ctrl->nr_active);
405 
406 		switch (ns->ana_state) {
407 		case NVME_ANA_OPTIMIZED:
408 			if (depth < min_depth_opt) {
409 				min_depth_opt = depth;
410 				best_opt = ns;
411 			}
412 			break;
413 		case NVME_ANA_NONOPTIMIZED:
414 			if (depth < min_depth_nonopt) {
415 				min_depth_nonopt = depth;
416 				best_nonopt = ns;
417 			}
418 			break;
419 		default:
420 			break;
421 		}
422 
423 		if (min_depth_opt == 0)
424 			return best_opt;
425 	}
426 
427 	return best_opt ? best_opt : best_nonopt;
428 }
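
/*
 * Worked example (a sketch): with two optimized paths whose controllers
 * currently report nr_active == 3 and nr_active == 5, the first one is
 * chosen.  An optimized path with nr_active == 0 short-circuits the scan
 * since no better candidate can exist.  Non-optimized paths are used only
 * when no optimized path is usable.
 */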
429 
430 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
431 {
432 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
433 		ns->ana_state == NVME_ANA_OPTIMIZED;
434 }
435 
436 static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
437 {
438 	int node = numa_node_id();
439 	struct nvme_ns *ns;
440 
441 	ns = srcu_dereference(head->current_path[node], &head->srcu);
442 	if (unlikely(!ns))
443 		return __nvme_find_path(head, node);
444 	if (unlikely(!nvme_path_is_optimized(ns)))
445 		return __nvme_find_path(head, node);
446 	return ns;
447 }
448 
449 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
450 {
451 	switch (READ_ONCE(head->subsys->iopolicy)) {
452 	case NVME_IOPOLICY_QD:
453 		return nvme_queue_depth_path(head);
454 	case NVME_IOPOLICY_RR:
455 		return nvme_round_robin_path(head);
456 	default:
457 		return nvme_numa_path(head);
458 	}
459 }
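
/*
 * Caller-pattern sketch, mirroring the existing users in this file (e.g.
 * nvme_ns_head_get_unique_id()): the returned ns is only valid while the
 * head's SRCU read lock is held.
 *
 *	int srcu_idx;
 *	struct nvme_ns *ns;
 *
 *	srcu_idx = srcu_read_lock(&head->srcu);
 *	ns = nvme_find_path(head);
 *	if (ns)
 *		ret = do_something(ns);	// hypothetical helper, not a real function
 *	srcu_read_unlock(&head->srcu, srcu_idx);
 */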
460 
461 static bool nvme_available_path(struct nvme_ns_head *head)
462 {
463 	struct nvme_ns *ns;
464 
465 	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
466 		return false;
467 
468 	list_for_each_entry_srcu(ns, &head->list, siblings,
469 				 srcu_read_lock_held(&head->srcu)) {
470 		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
471 			continue;
472 		switch (nvme_ctrl_state(ns->ctrl)) {
473 		case NVME_CTRL_LIVE:
474 		case NVME_CTRL_RESETTING:
475 		case NVME_CTRL_CONNECTING:
476 			return true;
477 		default:
478 			break;
479 		}
480 	}
481 
482 	/*
483 	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
484 	 * not immediately fail I/O. Instead, requeue the I/O for the configured
485 	 * duration, anticipating that if there's a transient link failure then
486 	 * it may recover within this time window. This parameter is exported to
487 	 * userspace via sysfs, and its default value is zero. It is internally
488 	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
489 	 * non-zero, this flag is set to true. When zero, the flag is cleared.
490 	 */
491 	return nvme_mpath_queue_if_no_path(head);
492 }
493 
494 static void nvme_ns_head_submit_bio(struct bio *bio)
495 {
496 	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
497 	struct device *dev = disk_to_dev(head->disk);
498 	struct nvme_ns *ns;
499 	int srcu_idx;
500 
501 	/*
502 	 * The namespace might be going away and the bio might be moved to a
503 	 * different queue via blk_steal_bios(), so we need to use the bio_split
504 	 * pool from the original queue to allocate the bvecs from.
505 	 */
506 	bio = bio_split_to_limits(bio);
507 	if (!bio)
508 		return;
509 
510 	srcu_idx = srcu_read_lock(&head->srcu);
511 	ns = nvme_find_path(head);
512 	if (likely(ns)) {
513 		bio_set_dev(bio, ns->disk->part0);
514 		bio->bi_opf |= REQ_NVME_MPATH;
515 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
516 				      bio->bi_iter.bi_sector);
517 		submit_bio_noacct(bio);
518 	} else if (nvme_available_path(head)) {
519 		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
520 
521 		spin_lock_irq(&head->requeue_lock);
522 		bio_list_add(&head->requeue_list, bio);
523 		spin_unlock_irq(&head->requeue_lock);
524 	} else {
525 		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
526 
527 		bio_io_error(bio);
528 	}
529 
530 	srcu_read_unlock(&head->srcu, srcu_idx);
531 }
532 
533 static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
534 {
535 	if (!nvme_tryget_ns_head(disk->private_data))
536 		return -ENXIO;
537 	return 0;
538 }
539 
540 static void nvme_ns_head_release(struct gendisk *disk)
541 {
542 	nvme_put_ns_head(disk->private_data);
543 }
544 
545 static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
546 		enum blk_unique_id type)
547 {
548 	struct nvme_ns_head *head = disk->private_data;
549 	struct nvme_ns *ns;
550 	int srcu_idx, ret = -EWOULDBLOCK;
551 
552 	srcu_idx = srcu_read_lock(&head->srcu);
553 	ns = nvme_find_path(head);
554 	if (ns)
555 		ret = nvme_ns_get_unique_id(ns, id, type);
556 	srcu_read_unlock(&head->srcu, srcu_idx);
557 	return ret;
558 }
559 
560 #ifdef CONFIG_BLK_DEV_ZONED
561 static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
562 		unsigned int nr_zones, struct blk_report_zones_args *args)
563 {
564 	struct nvme_ns_head *head = disk->private_data;
565 	struct nvme_ns *ns;
566 	int srcu_idx, ret = -EWOULDBLOCK;
567 
568 	srcu_idx = srcu_read_lock(&head->srcu);
569 	ns = nvme_find_path(head);
570 	if (ns)
571 		ret = nvme_ns_report_zones(ns, sector, nr_zones, args);
572 	srcu_read_unlock(&head->srcu, srcu_idx);
573 	return ret;
574 }
575 #else
576 #define nvme_ns_head_report_zones	NULL
577 #endif /* CONFIG_BLK_DEV_ZONED */
578 
579 const struct block_device_operations nvme_ns_head_ops = {
580 	.owner		= THIS_MODULE,
581 	.submit_bio	= nvme_ns_head_submit_bio,
582 	.open		= nvme_ns_head_open,
583 	.release	= nvme_ns_head_release,
584 	.ioctl		= nvme_ns_head_ioctl,
585 	.compat_ioctl	= blkdev_compat_ptr_ioctl,
586 	.getgeo		= nvme_getgeo,
587 	.get_unique_id	= nvme_ns_head_get_unique_id,
588 	.report_zones	= nvme_ns_head_report_zones,
589 	.pr_ops		= &nvme_pr_ops,
590 };
591 
592 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
593 {
594 	return container_of(cdev, struct nvme_ns_head, cdev);
595 }
596 
597 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
598 {
599 	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
600 		return -ENXIO;
601 	return 0;
602 }
603 
604 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
605 {
606 	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
607 	return 0;
608 }
609 
610 static const struct file_operations nvme_ns_head_chr_fops = {
611 	.owner		= THIS_MODULE,
612 	.open		= nvme_ns_head_chr_open,
613 	.release	= nvme_ns_head_chr_release,
614 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
615 	.compat_ioctl	= compat_ptr_ioctl,
616 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
617 	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
618 };
619 
620 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
621 {
622 	int ret;
623 
624 	head->cdev_device.parent = &head->subsys->dev;
625 	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
626 			   head->subsys->instance, head->instance);
627 	if (ret)
628 		return ret;
629 	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
630 			    &nvme_ns_head_chr_fops, THIS_MODULE);
631 	return ret;
632 }
633 
634 static void nvme_partition_scan_work(struct work_struct *work)
635 {
636 	struct nvme_ns_head *head =
637 		container_of(work, struct nvme_ns_head, partition_scan_work);
638 
639 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
640 					     &head->disk->state)))
641 		return;
642 
643 	mutex_lock(&head->disk->open_mutex);
644 	bdev_disk_changed(head->disk, false);
645 	mutex_unlock(&head->disk->open_mutex);
646 }
647 
648 static void nvme_requeue_work(struct work_struct *work)
649 {
650 	struct nvme_ns_head *head =
651 		container_of(work, struct nvme_ns_head, requeue_work);
652 	struct bio *bio, *next;
653 
654 	spin_lock_irq(&head->requeue_lock);
655 	next = bio_list_get(&head->requeue_list);
656 	spin_unlock_irq(&head->requeue_lock);
657 
658 	while ((bio = next) != NULL) {
659 		next = bio->bi_next;
660 		bio->bi_next = NULL;
661 
662 		submit_bio_noacct(bio);
663 	}
664 }
665 
666 static void nvme_remove_head(struct nvme_ns_head *head)
667 {
668 	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
669 		/*
670 		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
671 		 * to allow multipath to fail all I/O.
672 		 */
673 		kblockd_schedule_work(&head->requeue_work);
674 
675 		nvme_cdev_del(&head->cdev, &head->cdev_device);
676 		synchronize_srcu(&head->srcu);
677 		del_gendisk(head->disk);
678 	}
679 	nvme_put_ns_head(head);
680 }
681 
682 static void nvme_remove_head_work(struct work_struct *work)
683 {
684 	struct nvme_ns_head *head = container_of(to_delayed_work(work),
685 			struct nvme_ns_head, remove_work);
686 	bool remove = false;
687 
688 	mutex_lock(&head->subsys->lock);
689 	if (list_empty(&head->list)) {
690 		list_del_init(&head->entry);
691 		remove = true;
692 	}
693 	mutex_unlock(&head->subsys->lock);
694 	if (remove)
695 		nvme_remove_head(head);
696 
697 	module_put(THIS_MODULE);
698 }
699 
700 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
701 {
702 	struct queue_limits lim;
703 
704 	mutex_init(&head->lock);
705 	bio_list_init(&head->requeue_list);
706 	spin_lock_init(&head->requeue_lock);
707 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
708 	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
709 	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
710 	head->delayed_removal_secs = 0;
711 
712 	/*
713 	 * If "multipath_always_on" is enabled, a multipath node is added
714 	 * regardless of whether the disk is single/multi ported, and whether
715 	 * the namespace is shared or private. If "multipath_always_on" is not
716 	 * enabled, a multipath node is added only if the subsystem supports
717 	 * multiple controllers and the "multipath" option is configured. In
718 	 * either case, for private namespaces, we ensure that the NSID is
719 	 * unique.
720 	 */
721 	if (!multipath_always_on) {
722 		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
723 				!multipath)
724 			return 0;
725 	}
726 
727 	if (!nvme_is_unique_nsid(ctrl, head))
728 		return 0;
729 
730 	blk_set_stacking_limits(&lim);
731 	lim.dma_alignment = 3;
732 	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
733 		BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
734 	if (head->ids.csi == NVME_CSI_ZNS)
735 		lim.features |= BLK_FEAT_ZONED;
736 
737 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
738 	if (IS_ERR(head->disk))
739 		return PTR_ERR(head->disk);
740 	head->disk->fops = &nvme_ns_head_ops;
741 	head->disk->private_data = head;
742 
743 	/*
744 	 * We need to suppress the partition scan from occurring within the
745 	 * controller's scan_work context. If a path error occurs here, the I/O
746 	 * will wait until a path becomes available or all paths are torn down,
747 	 * but that action also occurs within scan_work, so it would deadlock.
748 	 * Defer the partition scan to a different context that does not block
749 	 * scan_work.
750 	 */
751 	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
752 	sprintf(head->disk->disk_name, "nvme%dn%d",
753 			ctrl->subsys->instance, head->instance);
754 	nvme_tryget_ns_head(head);
755 	return 0;
756 }
757 
758 static void nvme_mpath_set_live(struct nvme_ns *ns)
759 {
760 	struct nvme_ns_head *head = ns->head;
761 	int rc;
762 
763 	if (!head->disk)
764 		return;
765 
766 	/*
767 	 * test_and_set_bit() is used because it is protecting against two nvme
768 	 * paths simultaneously calling device_add_disk() on the same namespace
769 	 * head.
770 	 */
771 	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
772 		rc = device_add_disk(&head->subsys->dev, head->disk,
773 				     nvme_ns_attr_groups);
774 		if (rc) {
775 			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
776 			return;
777 		}
778 		nvme_add_ns_head_cdev(head);
779 		queue_work(nvme_wq, &head->partition_scan_work);
780 	}
781 
782 	nvme_mpath_add_sysfs_link(ns->head);
783 
784 	mutex_lock(&head->lock);
785 	if (nvme_path_is_optimized(ns)) {
786 		int node, srcu_idx;
787 
788 		srcu_idx = srcu_read_lock(&head->srcu);
789 		for_each_online_node(node)
790 			__nvme_find_path(head, node);
791 		srcu_read_unlock(&head->srcu, srcu_idx);
792 	}
793 	mutex_unlock(&head->lock);
794 
795 	synchronize_srcu(&head->srcu);
796 	kblockd_schedule_work(&head->requeue_work);
797 }
798 
799 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
800 		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
801 			void *))
802 {
803 	void *base = ctrl->ana_log_buf;
804 	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
805 	int error, i;
806 
807 	lockdep_assert_held(&ctrl->ana_lock);
808 
809 	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
810 		struct nvme_ana_group_desc *desc = base + offset;
811 		u32 nr_nsids;
812 		size_t nsid_buf_size;
813 
814 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
815 			return -EINVAL;
816 
817 		nr_nsids = le32_to_cpu(desc->nnsids);
818 		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
819 
820 		if (WARN_ON_ONCE(desc->grpid == 0))
821 			return -EINVAL;
822 		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
823 			return -EINVAL;
824 		if (WARN_ON_ONCE(desc->state == 0))
825 			return -EINVAL;
826 		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
827 			return -EINVAL;
828 
829 		offset += sizeof(*desc);
830 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
831 			return -EINVAL;
832 
833 		error = cb(ctrl, desc, data);
834 		if (error)
835 			return error;
836 
837 		offset += nsid_buf_size;
838 	}
839 
840 	return 0;
841 }
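
/*
 * Layout sketch of the log page parsed above (structures are defined in
 * include/linux/nvme.h):
 *
 *	struct nvme_ana_rsp_hdr		hdr	// hdr.ngrps descriptors follow
 *	struct nvme_ana_group_desc	desc 0	// desc.nnsids NSIDs follow
 *	__le32				nsids[]
 *	struct nvme_ana_group_desc	desc 1
 *	__le32				nsids[]
 *	...
 *
 * Each callback invocation sees one group descriptor; the offset checks
 * above guard against a log that is shorter than the advertised group
 * count.
 */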
842 
843 static inline bool nvme_state_is_live(enum nvme_ana_state state)
844 {
845 	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
846 }
847 
848 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
849 		struct nvme_ns *ns)
850 {
851 	ns->ana_grpid = le32_to_cpu(desc->grpid);
852 	ns->ana_state = desc->state;
853 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
854 	/*
855 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
856 	 * and in turn to this path device.  However we cannot accept this I/O
857 	 * if the controller is not live.  This may deadlock if called from
858 	 * nvme_mpath_init_identify() and the ctrl will never complete
859 	 * initialization, preventing I/O from completing.  For this case we
860 	 * will reprocess the ANA log page in nvme_mpath_update() once the
861 	 * controller is ready.
862 	 */
863 	if (nvme_state_is_live(ns->ana_state) &&
864 	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
865 		nvme_mpath_set_live(ns);
866 	else {
867 		/*
868 		 * Add a sysfs link from the multipath head gendisk node to the
869 		 * path device gendisk node.
870 		 * If the path's ANA state is live (i.e. either optimized or
871 		 * non-optimized) when the ns is allocated, the sysfs link is
872 		 * created from nvme_mpath_set_live() and we do not fall
873 		 * through to this code path. For any other ANA state,
874 		 * nvme_mpath_set_live() is called only after the state has
875 		 * transitioned to a live state. But we still want to create
876 		 * the sysfs link from the head node to a path device
877 		 * irrespective of the path's ANA state.
878 		 * So if we reach here, the path's ANA state is not live; still
879 		 * create the sysfs link to this path from the head node,
880 		 * provided the head node of the path has already come alive.
881 		 */
882 		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
883 			nvme_mpath_add_sysfs_link(ns->head);
884 	}
885 }
886 
887 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
888 		struct nvme_ana_group_desc *desc, void *data)
889 {
890 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
891 	unsigned *nr_change_groups = data;
892 	struct nvme_ns *ns;
893 	int srcu_idx;
894 
895 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
896 			le32_to_cpu(desc->grpid),
897 			nvme_ana_state_names[desc->state]);
898 
899 	if (desc->state == NVME_ANA_CHANGE)
900 		(*nr_change_groups)++;
901 
902 	if (!nr_nsids)
903 		return 0;
904 
905 	srcu_idx = srcu_read_lock(&ctrl->srcu);
906 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
907 				 srcu_read_lock_held(&ctrl->srcu)) {
908 		unsigned nsid;
909 again:
910 		nsid = le32_to_cpu(desc->nsids[n]);
911 		if (ns->head->ns_id < nsid)
912 			continue;
913 		if (ns->head->ns_id == nsid)
914 			nvme_update_ns_ana_state(desc, ns);
915 		if (++n == nr_nsids)
916 			break;
917 		if (ns->head->ns_id > nsid)
918 			goto again;
919 	}
920 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
921 	return 0;
922 }
923 
924 static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
925 {
926 	u32 nr_change_groups = 0;
927 	int error;
928 
929 	mutex_lock(&ctrl->ana_lock);
930 	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
931 			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
932 	if (error) {
933 		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
934 		goto out_unlock;
935 	}
936 
937 	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
938 			nvme_update_ana_state);
939 	if (error)
940 		goto out_unlock;
941 
942 	/*
943 	 * In theory we should have an ANATT timer per group as they might enter
944 	 * the change state at different times.  But that is a lot of overhead
945 	 * just to protect against a target that keeps entering new change
946 	 * states while never finishing previous ones.  But we'll still
947 	 * eventually time out once all groups are in change state, so this
948 	 * isn't a big deal.
949 	 *
950 	 * We also double the ANATT value to provide some slack for transports
951 	 * or AEN processing overhead.
952 	 */
953 	if (nr_change_groups)
954 		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
955 	else
956 		timer_delete_sync(&ctrl->anatt_timer);
957 out_unlock:
958 	mutex_unlock(&ctrl->ana_lock);
959 	return error;
960 }
961 
962 static void nvme_ana_work(struct work_struct *work)
963 {
964 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
965 
966 	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
967 		return;
968 
969 	nvme_read_ana_log(ctrl);
970 }
971 
972 void nvme_mpath_update(struct nvme_ctrl *ctrl)
973 {
974 	u32 nr_change_groups = 0;
975 
976 	if (!ctrl->ana_log_buf)
977 		return;
978 
979 	mutex_lock(&ctrl->ana_lock);
980 	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
981 	mutex_unlock(&ctrl->ana_lock);
982 }
983 
984 static void nvme_anatt_timeout(struct timer_list *t)
985 {
986 	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);
987 
988 	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
989 	nvme_reset_ctrl(ctrl);
990 }
991 
992 void nvme_mpath_stop(struct nvme_ctrl *ctrl)
993 {
994 	if (!nvme_ctrl_use_ana(ctrl))
995 		return;
996 	timer_delete_sync(&ctrl->anatt_timer);
997 	cancel_work_sync(&ctrl->ana_work);
998 }
999 
1000 #define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
1001 	struct device_attribute subsys_attr_##_name =	\
1002 		__ATTR(_name, _mode, _show, _store)
1003 
1004 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
1005 		struct device_attribute *attr, char *buf)
1006 {
1007 	struct nvme_subsystem *subsys =
1008 		container_of(dev, struct nvme_subsystem, dev);
1009 
1010 	return sysfs_emit(buf, "%s\n",
1011 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
1012 }
1013 
1014 static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
1015 		int iopolicy)
1016 {
1017 	struct nvme_ctrl *ctrl;
1018 	int old_iopolicy = READ_ONCE(subsys->iopolicy);
1019 
1020 	if (old_iopolicy == iopolicy)
1021 		return;
1022 
1023 	WRITE_ONCE(subsys->iopolicy, iopolicy);
1024 
1025 	/* iopolicy changes clear the mpath by design */
1026 	mutex_lock(&nvme_subsystems_lock);
1027 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1028 		nvme_mpath_clear_ctrl_paths(ctrl);
1029 	mutex_unlock(&nvme_subsystems_lock);
1030 
1031 	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
1032 			subsys->subnqn,
1033 			nvme_iopolicy_names[old_iopolicy],
1034 			nvme_iopolicy_names[iopolicy]);
1035 }
1036 
1037 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
1038 		struct device_attribute *attr, const char *buf, size_t count)
1039 {
1040 	struct nvme_subsystem *subsys =
1041 		container_of(dev, struct nvme_subsystem, dev);
1042 	int i;
1043 
1044 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
1045 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
1046 			nvme_subsys_iopolicy_update(subsys, i);
1047 			return count;
1048 		}
1049 	}
1050 
1051 	return -EINVAL;
1052 }
1053 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
1054 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
1055 
1056 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
1057 		char *buf)
1058 {
1059 	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
1060 }
1061 DEVICE_ATTR_RO(ana_grpid);
1062 
1063 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
1064 		char *buf)
1065 {
1066 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1067 
1068 	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
1069 }
1070 DEVICE_ATTR_RO(ana_state);
1071 
1072 static ssize_t queue_depth_show(struct device *dev,
1073 		struct device_attribute *attr, char *buf)
1074 {
1075 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1076 
1077 	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
1078 		return 0;
1079 
1080 	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
1081 }
1082 DEVICE_ATTR_RO(queue_depth);
1083 
1084 static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
1085 		char *buf)
1086 {
1087 	int node, srcu_idx;
1088 	nodemask_t numa_nodes;
1089 	struct nvme_ns *current_ns;
1090 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1091 	struct nvme_ns_head *head = ns->head;
1092 
1093 	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
1094 		return 0;
1095 
1096 	nodes_clear(numa_nodes);
1097 
1098 	srcu_idx = srcu_read_lock(&head->srcu);
1099 	for_each_node(node) {
1100 		current_ns = srcu_dereference(head->current_path[node],
1101 				&head->srcu);
1102 		if (ns == current_ns)
1103 			node_set(node, numa_nodes);
1104 	}
1105 	srcu_read_unlock(&head->srcu, srcu_idx);
1106 
1107 	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
1108 }
1109 DEVICE_ATTR_RO(numa_nodes);
1110 
1111 static ssize_t delayed_removal_secs_show(struct device *dev,
1112 		struct device_attribute *attr, char *buf)
1113 {
1114 	struct gendisk *disk = dev_to_disk(dev);
1115 	struct nvme_ns_head *head = disk->private_data;
1116 	int ret;
1117 
1118 	mutex_lock(&head->subsys->lock);
1119 	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
1120 	mutex_unlock(&head->subsys->lock);
1121 	return ret;
1122 }
1123 
1124 static ssize_t delayed_removal_secs_store(struct device *dev,
1125 		struct device_attribute *attr, const char *buf, size_t count)
1126 {
1127 	struct gendisk *disk = dev_to_disk(dev);
1128 	struct nvme_ns_head *head = disk->private_data;
1129 	unsigned int sec;
1130 	int ret;
1131 
1132 	ret = kstrtouint(buf, 0, &sec);
1133 	if (ret < 0)
1134 		return ret;
1135 
1136 	mutex_lock(&head->subsys->lock);
1137 	head->delayed_removal_secs = sec;
1138 	if (sec)
1139 		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1140 	else
1141 		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1142 	mutex_unlock(&head->subsys->lock);
1143 	/*
1144 	 * Ensure that the update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
1145 	 * by its readers.
1146 	 */
1147 	synchronize_srcu(&head->srcu);
1148 
1149 	return count;
1150 }
1151 
1152 DEVICE_ATTR_RW(delayed_removal_secs);
1153 
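
/*
 * Usage sketch (the exact sysfs location is an assumption; the attribute
 * hangs off the multipath head disk, e.g. /sys/block/nvme0n1):
 *
 *	# keep requeueing I/O for up to 60s after the last path goes away
 *	echo 60 > /sys/block/nvme0n1/delayed_removal_secs
 *
 * A non-zero value sets NVME_NSHEAD_QUEUE_IF_NO_PATH, which
 * nvme_available_path() consults before failing I/O; zero clears the flag
 * and restores immediate failure once no usable path is left.
 */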
1154 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
1155 		struct nvme_ana_group_desc *desc, void *data)
1156 {
1157 	struct nvme_ana_group_desc *dst = data;
1158 
1159 	if (desc->grpid != dst->grpid)
1160 		return 0;
1161 
1162 	*dst = *desc;
1163 	return -ENXIO; /* just break out of the loop */
1164 }
1165 
1166 void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
1167 {
1168 	struct device *target;
1169 	int rc, srcu_idx;
1170 	struct nvme_ns *ns;
1171 	struct kobject *kobj;
1172 
1173 	/*
1174 	 * Ensure the head disk node has already been added, otherwise we may
1175 	 * get an invalid kobj for the head disk node.
1176 	 */
1177 	if (!test_bit(GD_ADDED, &head->disk->state))
1178 		return;
1179 
1180 	kobj = &disk_to_dev(head->disk)->kobj;
1181 
1182 	/*
1183 	 * loop through each ns chained through the head->list and create the
1184 	 * sysfs link from head node to the ns path node
1185 	 */
1186 	srcu_idx = srcu_read_lock(&head->srcu);
1187 
1188 	list_for_each_entry_srcu(ns, &head->list, siblings,
1189 				 srcu_read_lock_held(&head->srcu)) {
1190 		/*
1191 		 * Ensure that the ns path disk node has already been added,
1192 		 * otherwise we may get an invalid kobj name for the target.
1193 		 */
1194 		if (!test_bit(GD_ADDED, &ns->disk->state))
1195 			continue;
1196 
1197 		/*
1198 		 * Avoid creating the link if it already exists for the given
1199 		 * path. When a path's ANA state transitions from optimized to
1200 		 * non-optimized or vice-versa, nvme_mpath_set_live() is
1201 		 * invoked, which in turn calls this function. If the sysfs
1202 		 * link already exists for the given path and we attempt to
1203 		 * re-create it, the sysfs code will warn about it loudly.
1204 		 * So we check the NVME_NS_SYSFS_ATTR_LINK flag here to ensure
1205 		 * that we're not creating a duplicate link.
1206 		 * The test_and_set_bit() is used because it is protecting
1207 		 * against multiple nvme paths being simultaneously added.
1208 		 */
1209 		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1210 			continue;
1211 
1212 		target = disk_to_dev(ns->disk);
1213 		/*
1214 		 * Create sysfs link from head gendisk kobject @kobj to the
1215 		 * ns path gendisk kobject @target->kobj.
1216 		 */
1217 		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
1218 				&target->kobj, dev_name(target));
1219 		if (unlikely(rc)) {
1220 			dev_err(disk_to_dev(ns->head->disk),
1221 					"failed to create link to %s\n",
1222 					dev_name(target));
1223 			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1224 		}
1225 	}
1226 
1227 	srcu_read_unlock(&head->srcu, srcu_idx);
1228 }
1229 
1230 void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
1231 {
1232 	struct device *target;
1233 	struct kobject *kobj;
1234 
1235 	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1236 		return;
1237 
1238 	target = disk_to_dev(ns->disk);
1239 	kobj = &disk_to_dev(ns->head->disk)->kobj;
1240 	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
1241 			dev_name(target));
1242 	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1243 }
1244 
1245 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
1246 {
1247 	if (nvme_ctrl_use_ana(ns->ctrl)) {
1248 		struct nvme_ana_group_desc desc = {
1249 			.grpid = anagrpid,
1250 			.state = 0,
1251 		};
1252 
1253 		mutex_lock(&ns->ctrl->ana_lock);
1254 		ns->ana_grpid = le32_to_cpu(anagrpid);
1255 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
1256 		mutex_unlock(&ns->ctrl->ana_lock);
1257 		if (desc.state) {
1258 			/* found the group desc: update */
1259 			nvme_update_ns_ana_state(&desc, ns);
1260 		} else {
1261 			/* group desc not found: trigger a re-read */
1262 			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
1263 			queue_work(nvme_wq, &ns->ctrl->ana_work);
1264 		}
1265 	} else {
1266 		ns->ana_state = NVME_ANA_OPTIMIZED;
1267 		nvme_mpath_set_live(ns);
1268 	}
1269 
1270 #ifdef CONFIG_BLK_DEV_ZONED
1271 	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
1272 		ns->head->disk->nr_zones = ns->disk->nr_zones;
1273 #endif
1274 }
1275 
1276 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
1277 {
1278 	bool remove = false;
1279 
1280 	if (!head->disk)
1281 		return;
1282 
1283 	mutex_lock(&head->subsys->lock);
1284 	/*
1285 	 * We are called when all paths have been removed, and at that point
1286 	 * head->list is expected to be empty. However, nvme_ns_remove() and
1287 	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
1288 	 * removal_secs is configured, it is possible that by the time we reach
1289 	 * this point, head->list may no longer be empty. Therefore, we recheck
1290 	 * head->list here. If it is no longer empty then we skip enqueuing the
1291 	 * delayed head removal work.
1292 	 */
1293 	if (!list_empty(&head->list))
1294 		goto out;
1295 
1296 	/*
1297 	 * Ensure that no one could remove this module while the head
1298 	 * remove work is pending.
1299 	 */
1300 	if (head->delayed_removal_secs && try_module_get(THIS_MODULE)) {
1301 		mod_delayed_work(nvme_wq, &head->remove_work,
1302 				head->delayed_removal_secs * HZ);
1303 	} else {
1304 		list_del_init(&head->entry);
1305 		remove = true;
1306 	}
1307 out:
1308 	mutex_unlock(&head->subsys->lock);
1309 	if (remove)
1310 		nvme_remove_head(head);
1311 }
1312 
1313 void nvme_mpath_put_disk(struct nvme_ns_head *head)
1314 {
1315 	if (!head->disk)
1316 		return;
1317 	/* make sure all pending bios are cleaned up */
1318 	kblockd_schedule_work(&head->requeue_work);
1319 	flush_work(&head->requeue_work);
1320 	flush_work(&head->partition_scan_work);
1321 	put_disk(head->disk);
1322 }
1323 
1324 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
1325 {
1326 	mutex_init(&ctrl->ana_lock);
1327 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
1328 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
1329 }
1330 
1331 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1332 {
1333 	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
1334 	size_t ana_log_size;
1335 	int error = 0;
1336 
1337 	/* check if multipath is enabled and we have the capability */
1338 	if (!multipath || !ctrl->subsys ||
1339 	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
1340 		return 0;
1341 
1342 	/* initialize this in the identify path to cover controller resets */
1343 	atomic_set(&ctrl->nr_active, 0);
1344 
1345 	if (!ctrl->max_namespaces ||
1346 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
1347 		dev_err(ctrl->device,
1348 			"Invalid MNAN value %u\n", ctrl->max_namespaces);
1349 		return -EINVAL;
1350 	}
1351 
1352 	ctrl->anacap = id->anacap;
1353 	ctrl->anatt = id->anatt;
1354 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
1355 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
1356 
1357 	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
1358 		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
1359 		ctrl->max_namespaces * sizeof(__le32);
1360 	if (ana_log_size > max_transfer_size) {
1361 		dev_err(ctrl->device,
1362 			"ANA log page size (%zd) larger than MDTS (%zd).\n",
1363 			ana_log_size, max_transfer_size);
1364 		dev_err(ctrl->device, "disabling ANA support.\n");
1365 		goto out_uninit;
1366 	}
1367 	if (ana_log_size > ctrl->ana_log_size) {
1368 		nvme_mpath_stop(ctrl);
1369 		nvme_mpath_uninit(ctrl);
1370 		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
1371 		if (!ctrl->ana_log_buf)
1372 			return -ENOMEM;
1373 	}
1374 	ctrl->ana_log_size = ana_log_size;
1375 	error = nvme_read_ana_log(ctrl);
1376 	if (error)
1377 		goto out_uninit;
1378 	return 0;
1379 
1380 out_uninit:
1381 	nvme_mpath_uninit(ctrl);
1382 	return error;
1383 }
1384 
1385 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
1386 {
1387 	kvfree(ctrl->ana_log_buf);
1388 	ctrl->ana_log_buf = NULL;
1389 	ctrl->ana_log_size = 0;
1390 }
1391