1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2018 Christoph Hellwig.
4  */
5 
6 #include <linux/backing-dev.h>
7 #include <linux/moduleparam.h>
8 #include <linux/vmalloc.h>
9 #include <trace/events/block.h>
10 #include "nvme.h"
11 
12 bool multipath = true;
13 static bool multipath_always_on;
14 
15 static int multipath_param_set(const char *val, const struct kernel_param *kp)
16 {
17 	int ret;
18 	bool *arg = kp->arg;
19 
20 	ret = param_set_bool(val, kp);
21 	if (ret)
22 		return ret;
23 
24 	if (multipath_always_on && !*arg) {
25 		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
26 		*arg = true;
27 		return -EINVAL;
28 	}
29 
30 	return 0;
31 }
32 
33 static const struct kernel_param_ops multipath_param_ops = {
34 	.set = multipath_param_set,
35 	.get = param_get_bool,
36 };
37 
38 module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
39 MODULE_PARM_DESC(multipath,
40 	"turn on native support for multiple controllers per subsystem");
41 
42 static int multipath_always_on_set(const char *val,
43 		const struct kernel_param *kp)
44 {
45 	int ret;
46 	bool *arg = kp->arg;
47 
48 	ret = param_set_bool(val, kp);
49 	if (ret < 0)
50 		return ret;
51 
52 	if (*arg)
53 		multipath = true;
54 
55 	return 0;
56 }
57 
58 static const struct kernel_param_ops multipath_always_on_ops = {
59 	.set = multipath_always_on_set,
60 	.get = param_get_bool,
61 };
62 
63 module_param_cb(multipath_always_on, &multipath_always_on_ops,
64 		&multipath_always_on, 0444);
65 MODULE_PARM_DESC(multipath_always_on,
66 	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
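/*
 * Both parameters above are 0444, i.e. read-only at runtime, so they must be
 * set at module load time, e.g. (assuming the usual nvme-core module naming)
 * with "nvme_core.multipath=N" on the kernel command line or through modprobe
 * options. Setting multipath_always_on also forces multipath on, and
 * multipath cannot be disabled again while multipath_always_on is set.
 */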
67 
68 static const char *nvme_iopolicy_names[] = {
69 	[NVME_IOPOLICY_NUMA]	= "numa",
70 	[NVME_IOPOLICY_RR]	= "round-robin",
71 	[NVME_IOPOLICY_QD]      = "queue-depth",
72 };
73 
74 static int iopolicy = NVME_IOPOLICY_NUMA;
75 
76 static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
77 {
78 	if (!val)
79 		return -EINVAL;
80 	if (!strncmp(val, "numa", 4))
81 		iopolicy = NVME_IOPOLICY_NUMA;
82 	else if (!strncmp(val, "round-robin", 11))
83 		iopolicy = NVME_IOPOLICY_RR;
84 	else if (!strncmp(val, "queue-depth", 11))
85 		iopolicy = NVME_IOPOLICY_QD;
86 	else
87 		return -EINVAL;
88 
89 	return 0;
90 }
91 
92 static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
93 {
94 	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
95 }
96 
97 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
98 	&iopolicy, 0644);
99 MODULE_PARM_DESC(iopolicy,
100 	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
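/*
 * The iopolicy module parameter (0644) only provides the default applied to
 * newly created subsystems via nvme_mpath_default_iopolicy(); an existing
 * subsystem is switched at runtime through its "iopolicy" sysfs attribute,
 * e.g. (path shown for illustration, assuming the usual sysfs layout):
 *	echo queue-depth > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
 */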
101 
102 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
103 {
104 	subsys->iopolicy = iopolicy;
105 }
106 
107 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
108 {
109 	struct nvme_ns_head *h;
110 
111 	lockdep_assert_held(&subsys->lock);
112 	list_for_each_entry(h, &subsys->nsheads, entry)
113 		if (h->disk)
114 			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
115 }
116 
117 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
118 {
119 	struct nvme_ns_head *h;
120 
121 	lockdep_assert_held(&subsys->lock);
122 	list_for_each_entry(h, &subsys->nsheads, entry)
123 		if (h->disk)
124 			blk_mq_freeze_queue_wait(h->disk->queue);
125 }
126 
127 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
128 {
129 	struct nvme_ns_head *h;
130 
131 	lockdep_assert_held(&subsys->lock);
132 	list_for_each_entry(h, &subsys->nsheads, entry)
133 		if (h->disk)
134 			blk_freeze_queue_start(h->disk->queue);
135 }
136 
137 void nvme_failover_req(struct request *req)
138 {
139 	struct nvme_ns *ns = req->q->queuedata;
140 	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
141 	unsigned long flags;
142 	struct bio *bio;
143 
144 	nvme_mpath_clear_current_path(ns);
145 
146 	/*
147 	 * If we got back an ANA error, we know the controller is alive but not
148 	 * ready to serve this namespace.  Kick off a re-read of the ANA
149 	 * information page, and just try any other available path for now.
150 	 */
151 	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
152 		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
153 		queue_work(nvme_wq, &ns->ctrl->ana_work);
154 	}
155 
156 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
157 	for (bio = req->bio; bio; bio = bio->bi_next) {
158 		bio_set_dev(bio, ns->head->disk->part0);
159 		if (bio->bi_opf & REQ_POLLED) {
160 			bio->bi_opf &= ~REQ_POLLED;
161 			bio->bi_cookie = BLK_QC_T_NONE;
162 		}
163 		/*
164 		 * The alternate request queue that we may end up submitting
165 		 * the bio to may be frozen temporarily; in that case REQ_NOWAIT
166 		 * will fail the I/O immediately with EAGAIN to the issuer.
167 		 * Unlike the issuer, we are in a context that is allowed to
168 		 * block, so clear the flag to avoid spurious EAGAIN I/O failures.
169 		 */
170 		bio->bi_opf &= ~REQ_NOWAIT;
171 	}
172 	blk_steal_bios(&ns->head->requeue_list, req);
173 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
174 
175 	nvme_req(req)->status = 0;
176 	nvme_end_req(req);
177 	kblockd_schedule_work(&ns->head->requeue_work);
178 }
179 
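/*
 * Accounting for the queue-depth policy: while that policy is active, each
 * request issued through the multipath node bumps the owning controller's
 * nr_active counter, and NVME_MPATH_CNT_ACTIVE marks the request so that
 * nvme_mpath_end_request() drops the count exactly once.
 */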
180 void nvme_mpath_start_request(struct request *rq)
181 {
182 	struct nvme_ns *ns = rq->q->queuedata;
183 	struct gendisk *disk = ns->head->disk;
184 
185 	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
186 	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
187 		atomic_inc(&ns->ctrl->nr_active);
188 		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
189 	}
190 
191 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq) ||
192 	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
193 		return;
194 
195 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
196 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
197 						      jiffies);
198 }
199 EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
200 
201 void nvme_mpath_end_request(struct request *rq)
202 {
203 	struct nvme_ns *ns = rq->q->queuedata;
204 
205 	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
206 		atomic_dec_if_positive(&ns->ctrl->nr_active);
207 
208 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
209 		return;
210 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
211 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
212 			 nvme_req(rq)->start_time);
213 }
214 
215 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
216 {
217 	struct nvme_ns *ns;
218 	int srcu_idx;
219 
220 	srcu_idx = srcu_read_lock(&ctrl->srcu);
221 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
222 				 srcu_read_lock_held(&ctrl->srcu)) {
223 		if (!ns->head->disk)
224 			continue;
225 		kblockd_schedule_work(&ns->head->requeue_work);
226 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
227 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
228 	}
229 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
230 }
231 
232 static const char *nvme_ana_state_names[] = {
233 	[0]				= "invalid state",
234 	[NVME_ANA_OPTIMIZED]		= "optimized",
235 	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
236 	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
237 	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
238 	[NVME_ANA_CHANGE]		= "change",
239 };
240 
241 bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
242 {
243 	struct nvme_ns_head *head = ns->head;
244 	bool changed = false;
245 	int node;
246 
247 	if (!head)
248 		goto out;
249 
250 	for_each_node(node) {
251 		if (ns == rcu_access_pointer(head->current_path[node])) {
252 			rcu_assign_pointer(head->current_path[node], NULL);
253 			changed = true;
254 		}
255 	}
256 out:
257 	return changed;
258 }
259 
260 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
261 {
262 	struct nvme_ns *ns;
263 	int srcu_idx;
264 
265 	srcu_idx = srcu_read_lock(&ctrl->srcu);
266 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
267 				 srcu_read_lock_held(&ctrl->srcu)) {
268 		nvme_mpath_clear_current_path(ns);
269 		kblockd_schedule_work(&ns->head->requeue_work);
270 	}
271 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
272 }
273 
274 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
275 {
276 	struct nvme_ns_head *head = ns->head;
277 	sector_t capacity = get_capacity(head->disk);
278 	int node;
279 	int srcu_idx;
280 
281 	srcu_idx = srcu_read_lock(&head->srcu);
282 	list_for_each_entry_srcu(ns, &head->list, siblings,
283 				 srcu_read_lock_held(&head->srcu)) {
284 		if (capacity != get_capacity(ns->disk))
285 			clear_bit(NVME_NS_READY, &ns->flags);
286 	}
287 	srcu_read_unlock(&head->srcu, srcu_idx);
288 
289 	for_each_node(node)
290 		rcu_assign_pointer(head->current_path[node], NULL);
291 	kblockd_schedule_work(&head->requeue_work);
292 }
293 
294 static bool nvme_path_is_disabled(struct nvme_ns *ns)
295 {
296 	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);
297 
298 	/*
299 	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
300 	 * still be able to complete assuming that the controller is connected.
301 	 * Otherwise it will fail immediately and return to the requeue list.
302 	 */
303 	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
304 		return true;
305 	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
306 	    !test_bit(NVME_NS_READY, &ns->flags))
307 		return true;
308 	return false;
309 }
310 
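/*
 * Select and cache the best path for @node: prefer the usable path in ANA
 * optimized state with the smallest NUMA distance to @node, falling back to
 * the closest non-optimized path. For non-NUMA I/O policies (or controllers
 * without a known NUMA node) every path gets LOCAL_DISTANCE, so the first
 * usable path in the best ANA state wins.
 */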
311 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
312 {
313 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
314 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
315 
316 	list_for_each_entry_srcu(ns, &head->list, siblings,
317 				 srcu_read_lock_held(&head->srcu)) {
318 		if (nvme_path_is_disabled(ns))
319 			continue;
320 
321 		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
322 		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
323 			distance = node_distance(node, ns->ctrl->numa_node);
324 		else
325 			distance = LOCAL_DISTANCE;
326 
327 		switch (ns->ana_state) {
328 		case NVME_ANA_OPTIMIZED:
329 			if (distance < found_distance) {
330 				found_distance = distance;
331 				found = ns;
332 			}
333 			break;
334 		case NVME_ANA_NONOPTIMIZED:
335 			if (distance < fallback_distance) {
336 				fallback_distance = distance;
337 				fallback = ns;
338 			}
339 			break;
340 		default:
341 			break;
342 		}
343 	}
344 
345 	if (!found)
346 		found = fallback;
347 	if (found)
348 		rcu_assign_pointer(head->current_path[node], found);
349 	return found;
350 }
351 
352 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
353 		struct nvme_ns *ns)
354 {
355 	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
356 			siblings);
357 	if (ns)
358 		return ns;
359 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
360 }
361 
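/*
 * Round-robin policy: start from the path cached for this node and walk the
 * sibling list for the next usable path, preferring an ANA optimized path
 * but accepting a non-optimized one. The cached path itself is only reused
 * under the fallback rules documented inside the function.
 */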
362 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
363 {
364 	struct nvme_ns *ns, *found = NULL;
365 	int node = numa_node_id();
366 	struct nvme_ns *old = srcu_dereference(head->current_path[node],
367 					       &head->srcu);
368 
369 	if (unlikely(!old))
370 		return __nvme_find_path(head, node);
371 
372 	if (list_is_singular(&head->list)) {
373 		if (nvme_path_is_disabled(old))
374 			return NULL;
375 		return old;
376 	}
377 
378 	for (ns = nvme_next_ns(head, old);
379 	     ns && ns != old;
380 	     ns = nvme_next_ns(head, ns)) {
381 		if (nvme_path_is_disabled(ns))
382 			continue;
383 
384 		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
385 			found = ns;
386 			goto out;
387 		}
388 		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
389 			found = ns;
390 	}
391 
392 	/*
393 	 * The loop above skips the current path for round-robin semantics.
394 	 * Fall back to the current path if either:
395 	 *  - no other optimized path found and current is optimized,
396 	 *  - no other usable path found and current is usable.
397 	 */
398 	if (!nvme_path_is_disabled(old) &&
399 	    (old->ana_state == NVME_ANA_OPTIMIZED ||
400 	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
401 		return old;
402 
403 	if (!found)
404 		return NULL;
405 out:
406 	rcu_assign_pointer(head->current_path[node], found);
407 	return found;
408 }
409 
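/*
 * Queue-depth policy: pick the path whose controller has the fewest
 * multipath requests in flight (ctrl->nr_active, maintained by
 * nvme_mpath_start_request() and nvme_mpath_end_request()), preferring ANA
 * optimized paths and returning early once an idle optimized path is found.
 */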
410 static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
411 {
412 	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
413 	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
414 	unsigned int depth;
415 
416 	list_for_each_entry_srcu(ns, &head->list, siblings,
417 				 srcu_read_lock_held(&head->srcu)) {
418 		if (nvme_path_is_disabled(ns))
419 			continue;
420 
421 		depth = atomic_read(&ns->ctrl->nr_active);
422 
423 		switch (ns->ana_state) {
424 		case NVME_ANA_OPTIMIZED:
425 			if (depth < min_depth_opt) {
426 				min_depth_opt = depth;
427 				best_opt = ns;
428 			}
429 			break;
430 		case NVME_ANA_NONOPTIMIZED:
431 			if (depth < min_depth_nonopt) {
432 				min_depth_nonopt = depth;
433 				best_nonopt = ns;
434 			}
435 			break;
436 		default:
437 			break;
438 		}
439 
440 		if (min_depth_opt == 0)
441 			return best_opt;
442 	}
443 
444 	return best_opt ? best_opt : best_nonopt;
445 }
446 
447 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
448 {
449 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
450 		ns->ana_state == NVME_ANA_OPTIMIZED;
451 }
452 
453 static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
454 {
455 	int node = numa_node_id();
456 	struct nvme_ns *ns;
457 
458 	ns = srcu_dereference(head->current_path[node], &head->srcu);
459 	if (unlikely(!ns))
460 		return __nvme_find_path(head, node);
461 	if (unlikely(!nvme_path_is_optimized(ns)))
462 		return __nvme_find_path(head, node);
463 	return ns;
464 }
465 
466 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
467 {
468 	switch (READ_ONCE(head->subsys->iopolicy)) {
469 	case NVME_IOPOLICY_QD:
470 		return nvme_queue_depth_path(head);
471 	case NVME_IOPOLICY_RR:
472 		return nvme_round_robin_path(head);
473 	default:
474 		return nvme_numa_path(head);
475 	}
476 }
477 
478 static bool nvme_available_path(struct nvme_ns_head *head)
479 {
480 	struct nvme_ns *ns;
481 
482 	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
483 		return false;
484 
485 	list_for_each_entry_srcu(ns, &head->list, siblings,
486 				 srcu_read_lock_held(&head->srcu)) {
487 		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
488 			continue;
489 		switch (nvme_ctrl_state(ns->ctrl)) {
490 		case NVME_CTRL_LIVE:
491 		case NVME_CTRL_RESETTING:
492 		case NVME_CTRL_CONNECTING:
493 			return true;
494 		default:
495 			break;
496 		}
497 	}
498 
499 	/*
500 	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
501 	 * not immediately fail I/O. Instead, requeue the I/O for the configured
502 	 * duration, anticipating that if there's a transient link failure then
503 	 * it may recover within this time window. This parameter is exported to
504 	 * userspace via sysfs, and its default value is zero. It is internally
505 	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
506 	 * non-zero, this flag is set to true. When zero, the flag is cleared.
507 	 */
508 	return nvme_mpath_queue_if_no_path(head);
509 }
510 
511 static void nvme_ns_head_submit_bio(struct bio *bio)
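/*
 * I/O entry point of the multipath node: pick a path under SRCU and remap
 * the bio to it; if no path is usable right now but one may become
 * available, requeue the bio, otherwise fail it.
 */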
512 {
513 	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
514 	struct device *dev = disk_to_dev(head->disk);
515 	struct nvme_ns *ns;
516 	int srcu_idx;
517 
518 	/*
519 	 * The namespace might be going away and the bio might be moved to a
520 	 * different queue via blk_steal_bios(), so we need to use the bio_split
521 	 * pool from the original queue to allocate the bvecs from.
522 	 */
523 	bio = bio_split_to_limits(bio);
524 	if (!bio)
525 		return;
526 
527 	srcu_idx = srcu_read_lock(&head->srcu);
528 	ns = nvme_find_path(head);
529 	if (likely(ns)) {
530 		bio_set_dev(bio, ns->disk->part0);
531 		bio->bi_opf |= REQ_NVME_MPATH;
532 		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
533 				      bio->bi_iter.bi_sector);
534 		submit_bio_noacct(bio);
535 	} else if (nvme_available_path(head)) {
536 		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
537 
538 		spin_lock_irq(&head->requeue_lock);
539 		bio_list_add(&head->requeue_list, bio);
540 		spin_unlock_irq(&head->requeue_lock);
541 	} else {
542 		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
543 
544 		bio_io_error(bio);
545 	}
546 
547 	srcu_read_unlock(&head->srcu, srcu_idx);
548 }
549 
550 static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
551 {
552 	if (!nvme_tryget_ns_head(disk->private_data))
553 		return -ENXIO;
554 	return 0;
555 }
556 
557 static void nvme_ns_head_release(struct gendisk *disk)
558 {
559 	nvme_put_ns_head(disk->private_data);
560 }
561 
562 static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
563 		enum blk_unique_id type)
564 {
565 	struct nvme_ns_head *head = disk->private_data;
566 	struct nvme_ns *ns;
567 	int srcu_idx, ret = -EWOULDBLOCK;
568 
569 	srcu_idx = srcu_read_lock(&head->srcu);
570 	ns = nvme_find_path(head);
571 	if (ns)
572 		ret = nvme_ns_get_unique_id(ns, id, type);
573 	srcu_read_unlock(&head->srcu, srcu_idx);
574 	return ret;
575 }
576 
577 #ifdef CONFIG_BLK_DEV_ZONED
578 static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
579 		unsigned int nr_zones, report_zones_cb cb, void *data)
580 {
581 	struct nvme_ns_head *head = disk->private_data;
582 	struct nvme_ns *ns;
583 	int srcu_idx, ret = -EWOULDBLOCK;
584 
585 	srcu_idx = srcu_read_lock(&head->srcu);
586 	ns = nvme_find_path(head);
587 	if (ns)
588 		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
589 	srcu_read_unlock(&head->srcu, srcu_idx);
590 	return ret;
591 }
592 #else
593 #define nvme_ns_head_report_zones	NULL
594 #endif /* CONFIG_BLK_DEV_ZONED */
595 
596 const struct block_device_operations nvme_ns_head_ops = {
597 	.owner		= THIS_MODULE,
598 	.submit_bio	= nvme_ns_head_submit_bio,
599 	.open		= nvme_ns_head_open,
600 	.release	= nvme_ns_head_release,
601 	.ioctl		= nvme_ns_head_ioctl,
602 	.compat_ioctl	= blkdev_compat_ptr_ioctl,
603 	.getgeo		= nvme_getgeo,
604 	.get_unique_id	= nvme_ns_head_get_unique_id,
605 	.report_zones	= nvme_ns_head_report_zones,
606 	.pr_ops		= &nvme_pr_ops,
607 };
608 
609 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
610 {
611 	return container_of(cdev, struct nvme_ns_head, cdev);
612 }
613 
614 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
615 {
616 	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
617 		return -ENXIO;
618 	return 0;
619 }
620 
621 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
622 {
623 	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
624 	return 0;
625 }
626 
627 static const struct file_operations nvme_ns_head_chr_fops = {
628 	.owner		= THIS_MODULE,
629 	.open		= nvme_ns_head_chr_open,
630 	.release	= nvme_ns_head_chr_release,
631 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
632 	.compat_ioctl	= compat_ptr_ioctl,
633 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
634 	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
635 };
636 
637 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
638 {
639 	int ret;
640 
641 	head->cdev_device.parent = &head->subsys->dev;
642 	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
643 			   head->subsys->instance, head->instance);
644 	if (ret)
645 		return ret;
646 	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
647 			    &nvme_ns_head_chr_fops, THIS_MODULE);
648 	return ret;
649 }
650 
651 static void nvme_partition_scan_work(struct work_struct *work)
652 {
653 	struct nvme_ns_head *head =
654 		container_of(work, struct nvme_ns_head, partition_scan_work);
655 
656 	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
657 					     &head->disk->state)))
658 		return;
659 
660 	mutex_lock(&head->disk->open_mutex);
661 	bdev_disk_changed(head->disk, false);
662 	mutex_unlock(&head->disk->open_mutex);
663 }
664 
665 static void nvme_requeue_work(struct work_struct *work)
666 {
667 	struct nvme_ns_head *head =
668 		container_of(work, struct nvme_ns_head, requeue_work);
669 	struct bio *bio, *next;
670 
671 	spin_lock_irq(&head->requeue_lock);
672 	next = bio_list_get(&head->requeue_list);
673 	spin_unlock_irq(&head->requeue_lock);
674 
675 	while ((bio = next) != NULL) {
676 		next = bio->bi_next;
677 		bio->bi_next = NULL;
678 
679 		submit_bio_noacct(bio);
680 	}
681 }
682 
683 static void nvme_remove_head(struct nvme_ns_head *head)
684 {
685 	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
686 		/*
687 		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
688 		 * to allow multipath to fail all I/O.
689 		 */
690 		kblockd_schedule_work(&head->requeue_work);
691 
692 		nvme_cdev_del(&head->cdev, &head->cdev_device);
693 		synchronize_srcu(&head->srcu);
694 		del_gendisk(head->disk);
695 	}
696 	nvme_put_ns_head(head);
697 }
698 
699 static void nvme_remove_head_work(struct work_struct *work)
700 {
701 	struct nvme_ns_head *head = container_of(to_delayed_work(work),
702 			struct nvme_ns_head, remove_work);
703 	bool remove = false;
704 
705 	mutex_lock(&head->subsys->lock);
706 	if (list_empty(&head->list)) {
707 		list_del_init(&head->entry);
708 		remove = true;
709 	}
710 	mutex_unlock(&head->subsys->lock);
711 	if (remove)
712 		nvme_remove_head(head);
713 
714 	module_put(THIS_MODULE);
715 }
716 
717 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
718 {
719 	struct queue_limits lim;
720 
721 	mutex_init(&head->lock);
722 	bio_list_init(&head->requeue_list);
723 	spin_lock_init(&head->requeue_lock);
724 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
725 	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
726 	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
727 	head->delayed_removal_secs = 0;
728 
729 	/*
730 	 * If "multipath_always_on" is enabled, a multipath node is added
731 	 * regardless of whether the disk is single/multi ported, and whether
732 	 * the namespace is shared or private. If "multipath_always_on" is not
733 	 * enabled, a multipath node is added only if the subsystem supports
734 	 * multiple controllers and the "multipath" option is configured. In
735 	 * either case, for private namespaces, we ensure that the NSID is
736 	 * unique.
737 	 */
738 	if (!multipath_always_on) {
739 		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
740 				!multipath)
741 			return 0;
742 	}
743 
744 	if (!nvme_is_unique_nsid(ctrl, head))
745 		return 0;
746 
747 	blk_set_stacking_limits(&lim);
748 	lim.dma_alignment = 3;
749 	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
750 		BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
751 	if (head->ids.csi == NVME_CSI_ZNS)
752 		lim.features |= BLK_FEAT_ZONED;
753 
754 	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
755 	if (IS_ERR(head->disk))
756 		return PTR_ERR(head->disk);
757 	head->disk->fops = &nvme_ns_head_ops;
758 	head->disk->private_data = head;
759 
760 	/*
761 	 * We need to suppress the partition scan from occurring within the
762 	 * controller's scan_work context. If a path error occurs here, the IO
763 	 * will wait until a path becomes available or all paths are torn down,
764 	 * but that action also occurs within scan_work, so it would deadlock.
765 	 * Defer the partition scan to a different context that does not block
766 	 * scan_work.
767 	 */
768 	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
769 	sprintf(head->disk->disk_name, "nvme%dn%d",
770 			ctrl->subsys->instance, head->instance);
771 	nvme_tryget_ns_head(head);
772 	return 0;
773 }
774 
775 static void nvme_mpath_set_live(struct nvme_ns *ns)
776 {
777 	struct nvme_ns_head *head = ns->head;
778 	int rc;
779 
780 	if (!head->disk)
781 		return;
782 
783 	/*
784 	 * test_and_set_bit() is used because it is protecting against two nvme
785 	 * paths simultaneously calling device_add_disk() on the same namespace
786 	 * head.
787 	 */
788 	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
789 		rc = device_add_disk(&head->subsys->dev, head->disk,
790 				     nvme_ns_attr_groups);
791 		if (rc) {
792 			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
793 			return;
794 		}
795 		nvme_add_ns_head_cdev(head);
796 		kblockd_schedule_work(&head->partition_scan_work);
797 	}
798 
799 	nvme_mpath_add_sysfs_link(ns->head);
800 
801 	mutex_lock(&head->lock);
802 	if (nvme_path_is_optimized(ns)) {
803 		int node, srcu_idx;
804 
805 		srcu_idx = srcu_read_lock(&head->srcu);
806 		for_each_online_node(node)
807 			__nvme_find_path(head, node);
808 		srcu_read_unlock(&head->srcu, srcu_idx);
809 	}
810 	mutex_unlock(&head->lock);
811 
812 	synchronize_srcu(&head->srcu);
813 	kblockd_schedule_work(&head->requeue_work);
814 }
815 
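/*
 * Walk the ANA log page: a response header followed by ngrps group
 * descriptors, each immediately followed by its nnsids NSID entries.  Every
 * descriptor is bounds-checked against the buffer size and then passed to
 * @cb.
 */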
816 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
817 		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
818 			void *))
819 {
820 	void *base = ctrl->ana_log_buf;
821 	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
822 	int error, i;
823 
824 	lockdep_assert_held(&ctrl->ana_lock);
825 
826 	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
827 		struct nvme_ana_group_desc *desc = base + offset;
828 		u32 nr_nsids;
829 		size_t nsid_buf_size;
830 
831 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
832 			return -EINVAL;
833 
834 		nr_nsids = le32_to_cpu(desc->nnsids);
835 		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
836 
837 		if (WARN_ON_ONCE(desc->grpid == 0))
838 			return -EINVAL;
839 		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
840 			return -EINVAL;
841 		if (WARN_ON_ONCE(desc->state == 0))
842 			return -EINVAL;
843 		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
844 			return -EINVAL;
845 
846 		offset += sizeof(*desc);
847 		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
848 			return -EINVAL;
849 
850 		error = cb(ctrl, desc, data);
851 		if (error)
852 			return error;
853 
854 		offset += nsid_buf_size;
855 	}
856 
857 	return 0;
858 }
859 
860 static inline bool nvme_state_is_live(enum nvme_ana_state state)
861 {
862 	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
863 }
864 
865 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
866 		struct nvme_ns *ns)
867 {
868 	ns->ana_grpid = le32_to_cpu(desc->grpid);
869 	ns->ana_state = desc->state;
870 	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
871 	/*
872 	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
873 	 * and in turn to this path device.  However we cannot accept this I/O
874 	 * if the controller is not live.  This may deadlock if called from
875 	 * nvme_mpath_init_identify() and the ctrl will never complete
876 	 * initialization, preventing I/O from completing.  For this case we
877 	 * will reprocess the ANA log page in nvme_mpath_update() once the
878 	 * controller is ready.
879 	 */
880 	if (nvme_state_is_live(ns->ana_state) &&
881 	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
882 		nvme_mpath_set_live(ns);
883 	else {
884 		/*
885 		 * Add a sysfs link from the multipath head gendisk node to the
886 		 * path device gendisk node.
887 		 * If the path's ANA state is live (i.e. either optimized or
888 		 * non-optimized) while we allocate the ns, the sysfs link is
889 		 * created from nvme_mpath_set_live() and we would not fall
890 		 * through to this code path. For any other ANA state, we call
891 		 * nvme_mpath_set_live() only after the state has transitioned
892 		 * to a live state. But we still want the sysfs link from the
893 		 * head node to the path device irrespective of the path's ANA
894 		 * state.
895 		 * So if we reach here, the path's ANA state is not live; still
896 		 * create the sysfs link to this path from the head node, if the
897 		 * head node of the path has already come alive.
898 		 */
899 		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
900 			nvme_mpath_add_sysfs_link(ns->head);
901 	}
902 }
903 
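/*
 * Apply one ANA group descriptor to the controller's namespaces.  This is a
 * merge of two sequences that are expected to be sorted in ascending NSID
 * order: the controller's namespace list and the descriptor's nsids[] array.
 * Whichever side is behind is advanced until all NSIDs in the descriptor
 * have been considered.
 */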
904 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
905 		struct nvme_ana_group_desc *desc, void *data)
906 {
907 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
908 	unsigned *nr_change_groups = data;
909 	struct nvme_ns *ns;
910 	int srcu_idx;
911 
912 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
913 			le32_to_cpu(desc->grpid),
914 			nvme_ana_state_names[desc->state]);
915 
916 	if (desc->state == NVME_ANA_CHANGE)
917 		(*nr_change_groups)++;
918 
919 	if (!nr_nsids)
920 		return 0;
921 
922 	srcu_idx = srcu_read_lock(&ctrl->srcu);
923 	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
924 				 srcu_read_lock_held(&ctrl->srcu)) {
925 		unsigned nsid;
926 again:
927 		nsid = le32_to_cpu(desc->nsids[n]);
928 		if (ns->head->ns_id < nsid)
929 			continue;
930 		if (ns->head->ns_id == nsid)
931 			nvme_update_ns_ana_state(desc, ns);
932 		if (++n == nr_nsids)
933 			break;
934 		if (ns->head->ns_id > nsid)
935 			goto again;
936 	}
937 	srcu_read_unlock(&ctrl->srcu, srcu_idx);
938 	return 0;
939 }
940 
941 static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
942 {
943 	u32 nr_change_groups = 0;
944 	int error;
945 
946 	mutex_lock(&ctrl->ana_lock);
947 	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
948 			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
949 	if (error) {
950 		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
951 		goto out_unlock;
952 	}
953 
954 	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
955 			nvme_update_ana_state);
956 	if (error)
957 		goto out_unlock;
958 
959 	/*
960 	 * In theory we should have an ANATT timer per group as they might enter
961 	 * the change state at different times.  But that is a lot of overhead
962 	 * just to protect against a target that keeps entering new change
963 	 * states while never finishing previous ones.  But we'll still
964 	 * eventually time out once all groups are in change state, so this
965 	 * isn't a big deal.
966 	 *
967 	 * We also double the ANATT value to provide some slack for transports
968 	 * or AEN processing overhead.
969 	 */
970 	if (nr_change_groups)
971 		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
972 	else
973 		timer_delete_sync(&ctrl->anatt_timer);
974 out_unlock:
975 	mutex_unlock(&ctrl->ana_lock);
976 	return error;
977 }
978 
979 static void nvme_ana_work(struct work_struct *work)
980 {
981 	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
982 
983 	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
984 		return;
985 
986 	nvme_read_ana_log(ctrl);
987 }
988 
989 void nvme_mpath_update(struct nvme_ctrl *ctrl)
990 {
991 	u32 nr_change_groups = 0;
992 
993 	if (!ctrl->ana_log_buf)
994 		return;
995 
996 	mutex_lock(&ctrl->ana_lock);
997 	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
998 	mutex_unlock(&ctrl->ana_lock);
999 }
1000 
1001 static void nvme_anatt_timeout(struct timer_list *t)
1002 {
1003 	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);
1004 
1005 	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
1006 	nvme_reset_ctrl(ctrl);
1007 }
1008 
1009 void nvme_mpath_stop(struct nvme_ctrl *ctrl)
1010 {
1011 	if (!nvme_ctrl_use_ana(ctrl))
1012 		return;
1013 	timer_delete_sync(&ctrl->anatt_timer);
1014 	cancel_work_sync(&ctrl->ana_work);
1015 }
1016 
1017 #define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
1018 	struct device_attribute subsys_attr_##_name =	\
1019 		__ATTR(_name, _mode, _show, _store)
1020 
1021 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
1022 		struct device_attribute *attr, char *buf)
1023 {
1024 	struct nvme_subsystem *subsys =
1025 		container_of(dev, struct nvme_subsystem, dev);
1026 
1027 	return sysfs_emit(buf, "%s\n",
1028 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
1029 }
1030 
1031 static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
1032 		int iopolicy)
1033 {
1034 	struct nvme_ctrl *ctrl;
1035 	int old_iopolicy = READ_ONCE(subsys->iopolicy);
1036 
1037 	if (old_iopolicy == iopolicy)
1038 		return;
1039 
1040 	WRITE_ONCE(subsys->iopolicy, iopolicy);
1041 
1042 	/* iopolicy changes clear the mpath by design */
1043 	mutex_lock(&nvme_subsystems_lock);
1044 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1045 		nvme_mpath_clear_ctrl_paths(ctrl);
1046 	mutex_unlock(&nvme_subsystems_lock);
1047 
1048 	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
1049 			subsys->subnqn,
1050 			nvme_iopolicy_names[old_iopolicy],
1051 			nvme_iopolicy_names[iopolicy]);
1052 }
1053 
1054 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
1055 		struct device_attribute *attr, const char *buf, size_t count)
1056 {
1057 	struct nvme_subsystem *subsys =
1058 		container_of(dev, struct nvme_subsystem, dev);
1059 	int i;
1060 
1061 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
1062 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
1063 			nvme_subsys_iopolicy_update(subsys, i);
1064 			return count;
1065 		}
1066 	}
1067 
1068 	return -EINVAL;
1069 }
1070 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
1071 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
1072 
1073 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
1074 		char *buf)
1075 {
1076 	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
1077 }
1078 DEVICE_ATTR_RO(ana_grpid);
1079 
1080 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
1081 		char *buf)
1082 {
1083 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1084 
1085 	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
1086 }
1087 DEVICE_ATTR_RO(ana_state);
1088 
1089 static ssize_t queue_depth_show(struct device *dev,
1090 		struct device_attribute *attr, char *buf)
1091 {
1092 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1093 
1094 	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
1095 		return 0;
1096 
1097 	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
1098 }
1099 DEVICE_ATTR_RO(queue_depth);
1100 
1101 static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
1102 		char *buf)
1103 {
1104 	int node, srcu_idx;
1105 	nodemask_t numa_nodes;
1106 	struct nvme_ns *current_ns;
1107 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
1108 	struct nvme_ns_head *head = ns->head;
1109 
1110 	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
1111 		return 0;
1112 
1113 	nodes_clear(numa_nodes);
1114 
1115 	srcu_idx = srcu_read_lock(&head->srcu);
1116 	for_each_node(node) {
1117 		current_ns = srcu_dereference(head->current_path[node],
1118 				&head->srcu);
1119 		if (ns == current_ns)
1120 			node_set(node, numa_nodes);
1121 	}
1122 	srcu_read_unlock(&head->srcu, srcu_idx);
1123 
1124 	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
1125 }
1126 DEVICE_ATTR_RO(numa_nodes);
1127 
1128 static ssize_t delayed_removal_secs_show(struct device *dev,
1129 		struct device_attribute *attr, char *buf)
1130 {
1131 	struct gendisk *disk = dev_to_disk(dev);
1132 	struct nvme_ns_head *head = disk->private_data;
1133 	int ret;
1134 
1135 	mutex_lock(&head->subsys->lock);
1136 	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
1137 	mutex_unlock(&head->subsys->lock);
1138 	return ret;
1139 }
1140 
1141 static ssize_t delayed_removal_secs_store(struct device *dev,
1142 		struct device_attribute *attr, const char *buf, size_t count)
1143 {
1144 	struct gendisk *disk = dev_to_disk(dev);
1145 	struct nvme_ns_head *head = disk->private_data;
1146 	unsigned int sec;
1147 	int ret;
1148 
1149 	ret = kstrtouint(buf, 0, &sec);
1150 	if (ret < 0)
1151 		return ret;
1152 
1153 	mutex_lock(&head->subsys->lock);
1154 	head->delayed_removal_secs = sec;
1155 	if (sec)
1156 		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1157 	else
1158 		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1159 	mutex_unlock(&head->subsys->lock);
1160 	/*
1161 	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
1162 	 * by its reader.
1163 	 */
1164 	synchronize_srcu(&head->srcu);
1165 
1166 	return count;
1167 }
1168 
1169 DEVICE_ATTR_RW(delayed_removal_secs);
1170 
1171 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
1172 		struct nvme_ana_group_desc *desc, void *data)
1173 {
1174 	struct nvme_ana_group_desc *dst = data;
1175 
1176 	if (desc->grpid != dst->grpid)
1177 		return 0;
1178 
1179 	*dst = *desc;
1180 	return -ENXIO; /* just break out of the loop */
1181 }
1182 
1183 void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
1184 {
1185 	struct device *target;
1186 	int rc, srcu_idx;
1187 	struct nvme_ns *ns;
1188 	struct kobject *kobj;
1189 
1190 	/*
1191 	 * Ensure the head disk node has already been added, otherwise we may
1192 	 * get an invalid kobj for the head disk node
1193 	 */
1194 	if (!test_bit(GD_ADDED, &head->disk->state))
1195 		return;
1196 
1197 	kobj = &disk_to_dev(head->disk)->kobj;
1198 
1199 	/*
1200 	 * loop through each ns chained through the head->list and create the
1201 	 * sysfs link from head node to the ns path node
1202 	 */
1203 	srcu_idx = srcu_read_lock(&head->srcu);
1204 
1205 	list_for_each_entry_srcu(ns, &head->list, siblings,
1206 				 srcu_read_lock_held(&head->srcu)) {
1207 		/*
1208 		 * Ensure that the ns path disk node has already been added,
1209 		 * otherwise we may get an invalid kobj name for the target
1210 		 */
1211 		if (!test_bit(GD_ADDED, &ns->disk->state))
1212 			continue;
1213 
1214 		/*
1215 		 * Avoid creating the link if it already exists for the given
1216 		 * path.  When a path's ANA state transitions from optimized to
1217 		 * non-optimized or vice versa, nvme_mpath_set_live() is
1218 		 * invoked, which in turn calls this function.  If the sysfs
1219 		 * link already exists for the given path and we attempt to
1220 		 * re-create it, the sysfs code warns about it loudly.  So we
1221 		 * check the NVME_NS_SYSFS_ATTR_LINK flag here to ensure that
1222 		 * we're not creating a duplicate link.
1223 		 * The test_and_set_bit() is used because it is protecting
1224 		 * against multiple nvme paths being simultaneously added.
1225 		 */
1226 		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1227 			continue;
1228 
1229 		target = disk_to_dev(ns->disk);
1230 		/*
1231 		 * Create sysfs link from head gendisk kobject @kobj to the
1232 		 * ns path gendisk kobject @target->kobj.
1233 		 */
1234 		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
1235 				&target->kobj, dev_name(target));
1236 		if (unlikely(rc)) {
1237 			dev_err(disk_to_dev(ns->head->disk),
1238 					"failed to create link to %s\n",
1239 					dev_name(target));
1240 			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1241 		}
1242 	}
1243 
1244 	srcu_read_unlock(&head->srcu, srcu_idx);
1245 }
1246 
1247 void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
1248 {
1249 	struct device *target;
1250 	struct kobject *kobj;
1251 
1252 	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
1253 		return;
1254 
1255 	target = disk_to_dev(ns->disk);
1256 	kobj = &disk_to_dev(ns->head->disk)->kobj;
1257 	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
1258 			dev_name(target));
1259 	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
1260 }
1261 
1262 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
1263 {
1264 	if (nvme_ctrl_use_ana(ns->ctrl)) {
1265 		struct nvme_ana_group_desc desc = {
1266 			.grpid = anagrpid,
1267 			.state = 0,
1268 		};
1269 
1270 		mutex_lock(&ns->ctrl->ana_lock);
1271 		ns->ana_grpid = le32_to_cpu(anagrpid);
1272 		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
1273 		mutex_unlock(&ns->ctrl->ana_lock);
1274 		if (desc.state) {
1275 			/* found the group desc: update */
1276 			nvme_update_ns_ana_state(&desc, ns);
1277 		} else {
1278 			/* group desc not found: trigger a re-read */
1279 			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
1280 			queue_work(nvme_wq, &ns->ctrl->ana_work);
1281 		}
1282 	} else {
1283 		ns->ana_state = NVME_ANA_OPTIMIZED;
1284 		nvme_mpath_set_live(ns);
1285 	}
1286 
1287 #ifdef CONFIG_BLK_DEV_ZONED
1288 	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
1289 		ns->head->disk->nr_zones = ns->disk->nr_zones;
1290 #endif
1291 }
1292 
1293 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
1294 {
1295 	bool remove = false;
1296 
1297 	if (!head->disk)
1298 		return;
1299 
1300 	mutex_lock(&head->subsys->lock);
1301 	/*
1302 	 * We are called when all paths have been removed, and at that point
1303 	 * head->list is expected to be empty. However, nvme_remove_ns() and
1304 	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
1305 	 * removal_secs is configured, it is possible that by the time we reach
1306 	 * this point, head->list may no longer be empty. Therefore, we recheck
1307 	 * head->list here. If it is no longer empty then we skip enqueuing the
1308 	 * delayed head removal work.
1309 	 */
1310 	if (!list_empty(&head->list))
1311 		goto out;
1312 
1313 	if (head->delayed_removal_secs) {
1314 		/*
1315 		 * Ensure that no one could remove this module while the head
1316 		 * remove work is pending.
1317 		 */
1318 		if (!try_module_get(THIS_MODULE))
1319 			goto out;
1320 		mod_delayed_work(nvme_wq, &head->remove_work,
1321 				head->delayed_removal_secs * HZ);
1322 	} else {
1323 		list_del_init(&head->entry);
1324 		remove = true;
1325 	}
1326 out:
1327 	mutex_unlock(&head->subsys->lock);
1328 	if (remove)
1329 		nvme_remove_head(head);
1330 }
1331 
1332 void nvme_mpath_put_disk(struct nvme_ns_head *head)
1333 {
1334 	if (!head->disk)
1335 		return;
1336 	/* make sure all pending bios are cleaned up */
1337 	kblockd_schedule_work(&head->requeue_work);
1338 	flush_work(&head->requeue_work);
1339 	flush_work(&head->partition_scan_work);
1340 	put_disk(head->disk);
1341 }
1342 
1343 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
1344 {
1345 	mutex_init(&ctrl->ana_lock);
1346 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
1347 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
1348 }
1349 
1350 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
1351 {
1352 	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
1353 	size_t ana_log_size;
1354 	int error = 0;
1355 
1356 	/* check if multipath is enabled and we have the capability */
1357 	if (!multipath || !ctrl->subsys ||
1358 	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
1359 		return 0;
1360 
1361 	/* initialize this in the identify path to cover controller resets */
1362 	atomic_set(&ctrl->nr_active, 0);
1363 
1364 	if (!ctrl->max_namespaces ||
1365 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
1366 		dev_err(ctrl->device,
1367 			"Invalid MNAN value %u\n", ctrl->max_namespaces);
1368 		return -EINVAL;
1369 	}
1370 
1371 	ctrl->anacap = id->anacap;
1372 	ctrl->anatt = id->anatt;
1373 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
1374 	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
1375 
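	/*
	 * Worst-case ANA log size: the response header, one group descriptor
	 * per ANA group, and one NSID entry per namespace (each namespace is
	 * a member of at most one ANA group).
	 */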
1376 	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
1377 		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
1378 		ctrl->max_namespaces * sizeof(__le32);
1379 	if (ana_log_size > max_transfer_size) {
1380 		dev_err(ctrl->device,
1381 			"ANA log page size (%zd) larger than MDTS (%zd).\n",
1382 			ana_log_size, max_transfer_size);
1383 		dev_err(ctrl->device, "disabling ANA support.\n");
1384 		goto out_uninit;
1385 	}
1386 	if (ana_log_size > ctrl->ana_log_size) {
1387 		nvme_mpath_stop(ctrl);
1388 		nvme_mpath_uninit(ctrl);
1389 		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
1390 		if (!ctrl->ana_log_buf)
1391 			return -ENOMEM;
1392 	}
1393 	ctrl->ana_log_size = ana_log_size;
1394 	error = nvme_read_ana_log(ctrl);
1395 	if (error)
1396 		goto out_uninit;
1397 	return 0;
1398 
1399 out_uninit:
1400 	nvme_mpath_uninit(ctrl);
1401 	return error;
1402 }
1403 
1404 void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
1405 {
1406 	kvfree(ctrl->ana_log_buf);
1407 	ctrl->ana_log_buf = NULL;
1408 	ctrl->ana_log_size = 0;
1409 }
1410