// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
static bool multipath_always_on;

static int multipath_param_set(const char *val, const struct kernel_param *kp)
{
	int ret;
	bool *arg = kp->arg;

	ret = param_set_bool(val, kp);
	if (ret)
		return ret;

	if (multipath_always_on && !*arg) {
		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
		*arg = true;
		return -EINVAL;
	}

	return 0;
}

static const struct kernel_param_ops multipath_param_ops = {
	.set = multipath_param_set,
	.get = param_get_bool,
};

module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static int multipath_always_on_set(const char *val,
		const struct kernel_param *kp)
{
	int ret;
	bool *arg = kp->arg;

	ret = param_set_bool(val, kp);
	if (ret < 0)
		return ret;

	if (*arg)
		multipath = true;

	return 0;
}

static const struct kernel_param_ops multipath_always_on_ops = {
	.set = multipath_always_on_set,
	.get = param_get_bool,
};

module_param_cb(multipath_always_on, &multipath_always_on_ops,
		&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
	[NVME_IOPOLICY_QD]	= "queue-depth",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else if (!strncmp(val, "queue-depth", 11))
		iopolicy = NVME_IOPOLICY_QD;
	else
		return -EINVAL;

	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");

void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

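/*
 * Handle a request that failed with a path-related status: clear the cached
 * current path, kick off an ANA log re-read for ANA errors, and move the
 * bios over to the ns_head requeue list so they can be retried on another
 * path while the original request is completed.
 */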
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace. Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily; in that case REQ_NOWAIT
		 * will fail the I/O immediately with EAGAIN to the issuer.
		 * Unlike the original issuer, we are not in a context that
		 * cannot block, so clear the flag to avoid spurious EAGAIN
		 * I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

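/*
 * Account a request against the multipath node: bump the per-controller
 * active count when the queue-depth iopolicy is in effect, and start I/O
 * statistics on the ns_head disk.
 */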
void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
		atomic_inc(&ns->ctrl->nr_active);
		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
	}

	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq) ||
	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
		atomic_dec_if_positive(&ns->ctrl->nr_active);

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);

	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

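/*
 * Select and cache the path to use for @node: the usable ANA-optimized path
 * with the smallest NUMA distance (all distances are equal unless the numa
 * iopolicy is in use), falling back to the closest non-optimized path when
 * no optimized path is usable.
 */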
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
				    struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
				   siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

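/*
 * Round-robin path selection: starting from the namespace after the cached
 * current path, pick the next usable optimized path, falling back to a
 * non-optimized one, so that successive submissions rotate across paths.
 */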
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *found = NULL;
	int node = numa_node_id();
	struct nvme_ns *old = srcu_dereference(head->current_path[node],
					       &head->srcu);

	if (unlikely(!old))
		return __nvme_find_path(head, node);

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

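/*
 * Queue-depth path selection: pick the usable path whose controller has the
 * fewest active multipath requests, preferring optimized over non-optimized
 * paths and bailing out early once an idle optimized path is found.
 */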
static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
{
	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
	unsigned int depth;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		depth = atomic_read(&ns->ctrl->nr_active);

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (depth < min_depth_opt) {
				min_depth_opt = depth;
				best_opt = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (depth < min_depth_nonopt) {
				min_depth_nonopt = depth;
				best_nonopt = ns;
			}
			break;
		default:
			break;
		}

		if (min_depth_opt == 0)
			return best_opt;
	}

	return best_opt ? best_opt : best_nonopt;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	switch (READ_ONCE(head->subsys->iopolicy)) {
	case NVME_IOPOLICY_QD:
		return nvme_queue_depth_path(head);
	case NVME_IOPOLICY_RR:
		return nvme_round_robin_path(head);
	default:
		return nvme_numa_path(head);
	}
}

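/*
 * Decide whether I/O that found no usable path should be requeued: return
 * true if any path's controller is live, resetting or connecting, or if
 * queueing with no path is explicitly enabled via delayed_removal_secs.
 */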
static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		return false;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (nvme_ctrl_state(ns->ctrl)) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			return true;
		default:
			break;
		}
	}

	/*
	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
	 * not immediately fail I/O. Instead, requeue the I/O for the configured
	 * duration, anticipating that if there's a transient link failure then
	 * it may recover within this time window. This parameter is exported to
	 * userspace via sysfs, and its default value is zero. It is internally
	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
	 * non-zero, this flag is set to true. When zero, the flag is cleared.
	 */
	return nvme_mpath_queue_if_no_path(head);
}

static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
{
	if (!nvme_tryget_ns_head(disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk)
{
	nvme_put_ns_head(disk->private_data);
}

static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
		enum blk_unique_id type)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_get_unique_id(ns, id, type);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.get_unique_id	= nvme_ns_head_get_unique_id,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

static void nvme_partition_scan_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, partition_scan_work);

	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
					     &head->disk->state)))
		return;

	mutex_lock(&head->disk->open_mutex);
	bdev_disk_changed(head->disk, false);
	mutex_unlock(&head->disk->open_mutex);
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

static void nvme_remove_head(struct nvme_ns_head *head)
{
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		/*
		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
		 * to allow multipath to fail all I/O.
		 */
		kblockd_schedule_work(&head->requeue_work);

		nvme_cdev_del(&head->cdev, &head->cdev_device);
		synchronize_srcu(&head->srcu);
		del_gendisk(head->disk);
	}
	nvme_put_ns_head(head);
}

static void nvme_remove_head_work(struct work_struct *work)
{
	struct nvme_ns_head *head = container_of(to_delayed_work(work),
			struct nvme_ns_head, remove_work);
	bool remove = false;

	mutex_lock(&head->subsys->lock);
	if (list_empty(&head->list)) {
		list_del_init(&head->entry);
		remove = true;
	}
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);

	module_put(THIS_MODULE);
}

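/*
 * Allocate the multipath gendisk for a namespace head, if multipath is
 * enabled for this subsystem.  Returns 0 without allocating a disk when no
 * multipath node is wanted (see the policy comment below).
 */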
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
	head->delayed_removal_secs = 0;

	/*
	 * If "multipath_always_on" is enabled, a multipath node is added
	 * regardless of whether the disk is single/multi ported, and whether
	 * the namespace is shared or private. If "multipath_always_on" is not
	 * enabled, a multipath node is added only if the subsystem supports
	 * multiple controllers and the "multipath" option is configured. In
	 * either case, for private namespaces, we ensure that the NSID is
	 * unique.
	 */
	if (!multipath_always_on) {
		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
		    !multipath)
			return 0;
	}

	if (!nvme_is_unique_nsid(ctrl, head))
		return 0;

	blk_set_stacking_limits(&lim);
	lim.dma_alignment = 3;
	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
			BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
	if (head->ids.csi == NVME_CSI_ZNS)
		lim.features |= BLK_FEAT_ZONED;

	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
	if (IS_ERR(head->disk))
		return PTR_ERR(head->disk);
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;

	/*
	 * We need to suppress the partition scan from occurring within the
	 * controller's scan_work context. If a path error occurs here, the I/O
	 * will wait until a path becomes available or all paths are torn down,
	 * but that action also occurs within scan_work, so it would deadlock.
	 * Defer the partition scan to a different context that does not block
	 * scan_work.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	nvme_tryget_ns_head(head);
	return 0;
}

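/*
 * Called when a path transitions to a live ANA state: register the head
 * gendisk and char device on the first live path, create the sysfs links,
 * pre-populate the per-node current path cache for optimized paths and kick
 * any requeued I/O.
 */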
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
		kblockd_schedule_work(&head->partition_scan_work);
	}

	nvme_mpath_add_sysfs_link(ns->head);

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_online_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

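/*
 * Walk all ANA group descriptors in the controller's ANA log buffer,
 * validating each descriptor against the log size and the controller's ANA
 * limits, and invoke @cb for every group until the callback returns
 * non-zero.
 */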
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device. However we cannot accept this I/O
	 * if the controller is not live. This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing. For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
	else {
		/*
		 * Add a sysfs link from the multipath head gendisk node to the
		 * path device gendisk node.
		 * If the path's ANA state is live (i.e. optimized or
		 * non-optimized) while we allocate the ns, the sysfs link is
		 * created from nvme_mpath_set_live() and we never fall
		 * through to this code path. For any other ANA state,
		 * nvme_mpath_set_live() is only called once the state
		 * transitions to a live state, but we still want the sysfs
		 * link from the head node to the path device irrespective of
		 * the path's ANA state.
		 * So if we reach this point, the path's ANA state is not
		 * live; still create the sysfs link to this path if the head
		 * node has already come alive.
		 */
		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
			nvme_mpath_add_sysfs_link(ns->head);
	}
}

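/*
 * nvme_parse_ana_log() callback: apply one ANA group descriptor to all
 * matching namespaces.  Both the descriptor's NSID list and the
 * controller's namespace list are expected to be sorted by NSID, so the
 * two are walked in lockstep.
 */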
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;
	int srcu_idx;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return 0;
}

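/*
 * Fetch the ANA log page from the controller and update the ANA state of
 * all namespaces.  If any group is still in the "change" state, arm the
 * ANATT timer so a stuck transition eventually resets the controller.
 */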
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			     ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
				   nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times. But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones. We'll still eventually
	 * time out once all groups are in change state, so this isn't a big
	 * deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		timer_delete_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	timer_delete_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)	\
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
		int iopolicy)
{
	struct nvme_ctrl *ctrl;
	int old_iopolicy = READ_ONCE(subsys->iopolicy);

	if (old_iopolicy == iopolicy)
		return;

	WRITE_ONCE(subsys->iopolicy, iopolicy);

	/* iopolicy changes clear the mpath by design */
	mutex_lock(&nvme_subsystems_lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		nvme_mpath_clear_ctrl_paths(ctrl);
	mutex_unlock(&nvme_subsystems_lock);

	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
		  subsys->subnqn,
		  nvme_iopolicy_names[old_iopolicy],
		  nvme_iopolicy_names[iopolicy]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			nvme_subsys_iopolicy_update(subsys, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static ssize_t queue_depth_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
		return 0;

	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
}
DEVICE_ATTR_RO(queue_depth);

static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	int node, srcu_idx;
	nodemask_t numa_nodes;
	struct nvme_ns *current_ns;
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvme_ns_head *head = ns->head;

	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
		return 0;

	nodes_clear(numa_nodes);

	srcu_idx = srcu_read_lock(&head->srcu);
	for_each_node(node) {
		current_ns = srcu_dereference(head->current_path[node],
				&head->srcu);
		if (ns == current_ns)
			node_set(node, numa_nodes);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
}
DEVICE_ATTR_RO(numa_nodes);

static ssize_t delayed_removal_secs_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	int ret;

	mutex_lock(&head->subsys->lock);
	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
	mutex_unlock(&head->subsys->lock);
	return ret;
}

static ssize_t delayed_removal_secs_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	unsigned int sec;
	int ret;

	ret = kstrtouint(buf, 0, &sec);
	if (ret < 0)
		return ret;

	mutex_lock(&head->subsys->lock);
	head->delayed_removal_secs = sec;
	if (sec)
		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	else
		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	mutex_unlock(&head->subsys->lock);
	/*
	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
	 * by its reader.
	 */
	synchronize_srcu(&head->srcu);

	return count;
}

DEVICE_ATTR_RW(delayed_removal_secs);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
	struct device *target;
	int rc, srcu_idx;
	struct nvme_ns *ns;
	struct kobject *kobj;

	/*
	 * Ensure that the head disk node has already been added, otherwise we
	 * may get an invalid kobj for the head disk node.
	 */
	if (!test_bit(GD_ADDED, &head->disk->state))
		return;

	kobj = &disk_to_dev(head->disk)->kobj;

	/*
	 * Loop through each ns chained through head->list and create the
	 * sysfs link from the head node to the ns path node.
	 */
	srcu_idx = srcu_read_lock(&head->srcu);

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		/*
		 * Ensure that the ns path disk node has already been added,
		 * otherwise we may get an invalid kobj name for the target.
		 */
		if (!test_bit(GD_ADDED, &ns->disk->state))
			continue;

		/*
		 * Avoid creating the link if it already exists for the given
		 * path. When a path's ANA state transitions from optimized to
		 * non-optimized or vice-versa, nvme_mpath_set_live() is
		 * invoked, which in turn calls this function. If the sysfs
		 * link already exists for the given path and we attempt to
		 * re-create it, the sysfs code warns about it loudly. So we
		 * check the NVME_NS_SYSFS_ATTR_LINK flag here to ensure that
		 * we're not creating a duplicate link.
		 * test_and_set_bit() is used because it protects against
		 * multiple nvme paths being added simultaneously.
		 */
		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
			continue;

		target = disk_to_dev(ns->disk);
		/*
		 * Create the sysfs link from the head gendisk kobject @kobj to
		 * the ns path gendisk kobject @target->kobj.
		 */
		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
					     &target->kobj, dev_name(target));
		if (unlikely(rc)) {
			dev_err(disk_to_dev(ns->head->disk),
				"failed to create link to %s\n",
				dev_name(target));
			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
		}
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
{
	struct device *target;
	struct kobject *kobj;

	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
		return;

	target = disk_to_dev(ns->disk);
	kobj = &disk_to_dev(ns->head->disk)->kobj;
	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
				     dev_name(target));
	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
}

void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

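/*
 * Called once the last path to a namespace head has been removed: either
 * tear the head down immediately, or, if delayed_removal_secs is set,
 * defer the teardown so I/O can stay queued while a path may reconnect.
 */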
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	bool remove = false;

	if (!head->disk)
		return;

	mutex_lock(&head->subsys->lock);
	/*
	 * We are called when all paths have been removed, and at that point
	 * head->list is expected to be empty. However, nvme_remove_ns() and
	 * nvme_init_ns_head() can run concurrently and so if
	 * head->delayed_removal_secs is configured, it is possible that by the
	 * time we reach this point, head->list may no longer be empty.
	 * Therefore, we recheck head->list here. If it is no longer empty then
	 * we skip enqueuing the delayed head removal work.
	 */
	if (!list_empty(&head->list))
		goto out;

	if (head->delayed_removal_secs) {
		/*
		 * Ensure that no one could remove this module while the head
		 * remove work is pending.
		 */
		if (!try_module_get(THIS_MODULE))
			goto out;
		mod_delayed_work(nvme_wq, &head->remove_work,
				 head->delayed_removal_secs * HZ);
	} else {
		list_del_init(&head->entry);
		remove = true;
	}
out:
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);
}

void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	flush_work(&head->partition_scan_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

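/*
 * Validate the ANA fields of the Identify Controller data, allocate (or
 * reuse) the ANA log buffer and read the initial ANA log.  Returns 0 with
 * ANA left disabled when the controller or transport cannot support it.
 */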
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	/* initialize this in the identify path to cover controller resets */
	atomic_set(&ctrl->nr_active, 0);

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}