xref: /linux/drivers/nvme/target/core.c (revision a7f7f6248d9740d710fd6bd190293fe5e16410ac)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Common code for the NVMe target.
4  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/random.h>
9 #include <linux/rculist.h>
10 #include <linux/pci-p2pdma.h>
11 #include <linux/scatterlist.h>
12 
13 #define CREATE_TRACE_POINTS
14 #include "trace.h"
15 
16 #include "nvmet.h"
17 
18 struct workqueue_struct *buffered_io_wq;
19 static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
20 static DEFINE_IDA(cntlid_ida);
21 
22 /*
23  * This read/write semaphore is used to synchronize access to configuration
24  * information on a target system that will result in discovery log page
25  * information change for at least one host.
26  * The full list of resources to protected by this semaphore is:
27  *
28  *  - subsystems list
29  *  - per-subsystem allowed hosts list
30  *  - allow_any_host subsystem attribute
31  *  - nvmet_genctr
32  *  - the nvmet_transports array
33  *
34  * When updating any of those lists/structures write lock should be obtained,
35  * while when reading (popolating discovery log page or checking host-subsystem
36  * link) read lock is obtained to allow concurrent reads.
37  */
38 DECLARE_RWSEM(nvmet_config_sem);
39 
40 u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
41 u64 nvmet_ana_chgcnt;
42 DECLARE_RWSEM(nvmet_ana_sem);
43 
44 inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
45 {
46 	u16 status;
47 
48 	switch (errno) {
49 	case 0:
50 		status = NVME_SC_SUCCESS;
51 		break;
52 	case -ENOSPC:
53 		req->error_loc = offsetof(struct nvme_rw_command, length);
54 		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
55 		break;
56 	case -EREMOTEIO:
57 		req->error_loc = offsetof(struct nvme_rw_command, slba);
58 		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
59 		break;
60 	case -EOPNOTSUPP:
61 		req->error_loc = offsetof(struct nvme_common_command, opcode);
62 		switch (req->cmd->common.opcode) {
63 		case nvme_cmd_dsm:
64 		case nvme_cmd_write_zeroes:
65 			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
66 			break;
67 		default:
68 			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
69 		}
70 		break;
71 	case -ENODATA:
72 		req->error_loc = offsetof(struct nvme_rw_command, nsid);
73 		status = NVME_SC_ACCESS_DENIED;
74 		break;
75 	case -EIO:
76 		/* FALLTHRU */
77 	default:
78 		req->error_loc = offsetof(struct nvme_common_command, opcode);
79 		status = NVME_SC_INTERNAL | NVME_SC_DNR;
80 	}
81 
82 	return status;
83 }
84 
85 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
86 		const char *subsysnqn);
87 
88 u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
89 		size_t len)
90 {
91 	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
92 		req->error_loc = offsetof(struct nvme_common_command, dptr);
93 		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
94 	}
95 	return 0;
96 }
97 
98 u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
99 {
100 	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
101 		req->error_loc = offsetof(struct nvme_common_command, dptr);
102 		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
103 	}
104 	return 0;
105 }
106 
107 u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
108 {
109 	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
110 		req->error_loc = offsetof(struct nvme_common_command, dptr);
111 		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
112 	}
113 	return 0;
114 }
115 
116 static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
117 {
118 	struct nvmet_ns *ns;
119 
120 	if (list_empty(&subsys->namespaces))
121 		return 0;
122 
123 	ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
124 	return ns->nsid;
125 }
126 
127 static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
128 {
129 	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
130 }
131 
132 static void nvmet_async_events_process(struct nvmet_ctrl *ctrl, u16 status)
133 {
134 	struct nvmet_async_event *aen;
135 	struct nvmet_req *req;
136 
137 	mutex_lock(&ctrl->lock);
138 	while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
139 		aen = list_first_entry(&ctrl->async_events,
140 				       struct nvmet_async_event, entry);
141 		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
142 		if (status == 0)
143 			nvmet_set_result(req, nvmet_async_event_result(aen));
144 
145 		list_del(&aen->entry);
146 		kfree(aen);
147 
148 		mutex_unlock(&ctrl->lock);
149 		trace_nvmet_async_event(ctrl, req->cqe->result.u32);
150 		nvmet_req_complete(req, status);
151 		mutex_lock(&ctrl->lock);
152 	}
153 	mutex_unlock(&ctrl->lock);
154 }
155 
156 static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
157 {
158 	struct nvmet_async_event *aen, *tmp;
159 
160 	mutex_lock(&ctrl->lock);
161 	list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
162 		list_del(&aen->entry);
163 		kfree(aen);
164 	}
165 	mutex_unlock(&ctrl->lock);
166 }
167 
168 static void nvmet_async_event_work(struct work_struct *work)
169 {
170 	struct nvmet_ctrl *ctrl =
171 		container_of(work, struct nvmet_ctrl, async_event_work);
172 
173 	nvmet_async_events_process(ctrl, 0);
174 }
175 
176 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
177 		u8 event_info, u8 log_page)
178 {
179 	struct nvmet_async_event *aen;
180 
181 	aen = kmalloc(sizeof(*aen), GFP_KERNEL);
182 	if (!aen)
183 		return;
184 
185 	aen->event_type = event_type;
186 	aen->event_info = event_info;
187 	aen->log_page = log_page;
188 
189 	mutex_lock(&ctrl->lock);
190 	list_add_tail(&aen->entry, &ctrl->async_events);
191 	mutex_unlock(&ctrl->lock);
192 
193 	schedule_work(&ctrl->async_event_work);
194 }
195 
196 static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
197 {
198 	u32 i;
199 
200 	mutex_lock(&ctrl->lock);
201 	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
202 		goto out_unlock;
203 
204 	for (i = 0; i < ctrl->nr_changed_ns; i++) {
205 		if (ctrl->changed_ns_list[i] == nsid)
206 			goto out_unlock;
207 	}
208 
209 	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
210 		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
211 		ctrl->nr_changed_ns = U32_MAX;
212 		goto out_unlock;
213 	}
214 
215 	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
216 out_unlock:
217 	mutex_unlock(&ctrl->lock);
218 }
219 
220 void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
221 {
222 	struct nvmet_ctrl *ctrl;
223 
224 	lockdep_assert_held(&subsys->lock);
225 
226 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
227 		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
228 		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
229 			continue;
230 		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
231 				NVME_AER_NOTICE_NS_CHANGED,
232 				NVME_LOG_CHANGED_NS);
233 	}
234 }
235 
236 void nvmet_send_ana_event(struct nvmet_subsys *subsys,
237 		struct nvmet_port *port)
238 {
239 	struct nvmet_ctrl *ctrl;
240 
241 	mutex_lock(&subsys->lock);
242 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
243 		if (port && ctrl->port != port)
244 			continue;
245 		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
246 			continue;
247 		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
248 				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
249 	}
250 	mutex_unlock(&subsys->lock);
251 }
252 
253 void nvmet_port_send_ana_event(struct nvmet_port *port)
254 {
255 	struct nvmet_subsys_link *p;
256 
257 	down_read(&nvmet_config_sem);
258 	list_for_each_entry(p, &port->subsystems, entry)
259 		nvmet_send_ana_event(p->subsys, port);
260 	up_read(&nvmet_config_sem);
261 }
262 
263 int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
264 {
265 	int ret = 0;
266 
267 	down_write(&nvmet_config_sem);
268 	if (nvmet_transports[ops->type])
269 		ret = -EINVAL;
270 	else
271 		nvmet_transports[ops->type] = ops;
272 	up_write(&nvmet_config_sem);
273 
274 	return ret;
275 }
276 EXPORT_SYMBOL_GPL(nvmet_register_transport);
277 
278 void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
279 {
280 	down_write(&nvmet_config_sem);
281 	nvmet_transports[ops->type] = NULL;
282 	up_write(&nvmet_config_sem);
283 }
284 EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
285 
286 void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
287 {
288 	struct nvmet_ctrl *ctrl;
289 
290 	mutex_lock(&subsys->lock);
291 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
292 		if (ctrl->port == port)
293 			ctrl->ops->delete_ctrl(ctrl);
294 	}
295 	mutex_unlock(&subsys->lock);
296 }
297 
298 int nvmet_enable_port(struct nvmet_port *port)
299 {
300 	const struct nvmet_fabrics_ops *ops;
301 	int ret;
302 
303 	lockdep_assert_held(&nvmet_config_sem);
304 
305 	ops = nvmet_transports[port->disc_addr.trtype];
306 	if (!ops) {
307 		up_write(&nvmet_config_sem);
308 		request_module("nvmet-transport-%d", port->disc_addr.trtype);
309 		down_write(&nvmet_config_sem);
310 		ops = nvmet_transports[port->disc_addr.trtype];
311 		if (!ops) {
312 			pr_err("transport type %d not supported\n",
313 				port->disc_addr.trtype);
314 			return -EINVAL;
315 		}
316 	}
317 
318 	if (!try_module_get(ops->owner))
319 		return -EINVAL;
320 
321 	/*
322 	 * If the user requested PI support and the transport isn't pi capable,
323 	 * don't enable the port.
324 	 */
325 	if (port->pi_enable && !ops->metadata_support) {
326 		pr_err("T10-PI is not supported by transport type %d\n",
327 		       port->disc_addr.trtype);
328 		ret = -EINVAL;
329 		goto out_put;
330 	}
331 
332 	ret = ops->add_port(port);
333 	if (ret)
334 		goto out_put;
335 
336 	/* If the transport didn't set inline_data_size, then disable it. */
337 	if (port->inline_data_size < 0)
338 		port->inline_data_size = 0;
339 
340 	port->enabled = true;
341 	port->tr_ops = ops;
342 	return 0;
343 
344 out_put:
345 	module_put(ops->owner);
346 	return ret;
347 }
348 
349 void nvmet_disable_port(struct nvmet_port *port)
350 {
351 	const struct nvmet_fabrics_ops *ops;
352 
353 	lockdep_assert_held(&nvmet_config_sem);
354 
355 	port->enabled = false;
356 	port->tr_ops = NULL;
357 
358 	ops = nvmet_transports[port->disc_addr.trtype];
359 	ops->remove_port(port);
360 	module_put(ops->owner);
361 }
362 
363 static void nvmet_keep_alive_timer(struct work_struct *work)
364 {
365 	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
366 			struct nvmet_ctrl, ka_work);
367 	bool cmd_seen = ctrl->cmd_seen;
368 
369 	ctrl->cmd_seen = false;
370 	if (cmd_seen) {
371 		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
372 			ctrl->cntlid);
373 		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
374 		return;
375 	}
376 
377 	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
378 		ctrl->cntlid, ctrl->kato);
379 
380 	nvmet_ctrl_fatal_error(ctrl);
381 }
382 
383 static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
384 {
385 	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
386 		ctrl->cntlid, ctrl->kato);
387 
388 	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
389 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
390 }
391 
392 static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
393 {
394 	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
395 
396 	cancel_delayed_work_sync(&ctrl->ka_work);
397 }
398 
399 static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
400 		__le32 nsid)
401 {
402 	struct nvmet_ns *ns;
403 
404 	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
405 		if (ns->nsid == le32_to_cpu(nsid))
406 			return ns;
407 	}
408 
409 	return NULL;
410 }
411 
412 struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
413 {
414 	struct nvmet_ns *ns;
415 
416 	rcu_read_lock();
417 	ns = __nvmet_find_namespace(ctrl, nsid);
418 	if (ns)
419 		percpu_ref_get(&ns->ref);
420 	rcu_read_unlock();
421 
422 	return ns;
423 }
424 
425 static void nvmet_destroy_namespace(struct percpu_ref *ref)
426 {
427 	struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
428 
429 	complete(&ns->disable_done);
430 }
431 
432 void nvmet_put_namespace(struct nvmet_ns *ns)
433 {
434 	percpu_ref_put(&ns->ref);
435 }
436 
437 static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
438 {
439 	nvmet_bdev_ns_disable(ns);
440 	nvmet_file_ns_disable(ns);
441 }
442 
443 static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
444 {
445 	int ret;
446 	struct pci_dev *p2p_dev;
447 
448 	if (!ns->use_p2pmem)
449 		return 0;
450 
451 	if (!ns->bdev) {
452 		pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
453 		return -EINVAL;
454 	}
455 
456 	if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
457 		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
458 		       ns->device_path);
459 		return -EINVAL;
460 	}
461 
462 	if (ns->p2p_dev) {
463 		ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
464 		if (ret < 0)
465 			return -EINVAL;
466 	} else {
467 		/*
468 		 * Right now we just check that there is p2pmem available so
469 		 * we can report an error to the user right away if there
470 		 * is not. We'll find the actual device to use once we
471 		 * setup the controller when the port's device is available.
472 		 */
473 
474 		p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
475 		if (!p2p_dev) {
476 			pr_err("no peer-to-peer memory is available for %s\n",
477 			       ns->device_path);
478 			return -EINVAL;
479 		}
480 
481 		pci_dev_put(p2p_dev);
482 	}
483 
484 	return 0;
485 }
486 
487 /*
488  * Note: ctrl->subsys->lock should be held when calling this function
489  */
490 static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
491 				    struct nvmet_ns *ns)
492 {
493 	struct device *clients[2];
494 	struct pci_dev *p2p_dev;
495 	int ret;
496 
497 	if (!ctrl->p2p_client || !ns->use_p2pmem)
498 		return;
499 
500 	if (ns->p2p_dev) {
501 		ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
502 		if (ret < 0)
503 			return;
504 
505 		p2p_dev = pci_dev_get(ns->p2p_dev);
506 	} else {
507 		clients[0] = ctrl->p2p_client;
508 		clients[1] = nvmet_ns_dev(ns);
509 
510 		p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
511 		if (!p2p_dev) {
512 			pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
513 			       dev_name(ctrl->p2p_client), ns->device_path);
514 			return;
515 		}
516 	}
517 
518 	ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
519 	if (ret < 0)
520 		pci_dev_put(p2p_dev);
521 
522 	pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
523 		ns->nsid);
524 }
525 
526 void nvmet_ns_revalidate(struct nvmet_ns *ns)
527 {
528 	loff_t oldsize = ns->size;
529 
530 	if (ns->bdev)
531 		nvmet_bdev_ns_revalidate(ns);
532 	else
533 		nvmet_file_ns_revalidate(ns);
534 
535 	if (oldsize != ns->size)
536 		nvmet_ns_changed(ns->subsys, ns->nsid);
537 }
538 
539 int nvmet_ns_enable(struct nvmet_ns *ns)
540 {
541 	struct nvmet_subsys *subsys = ns->subsys;
542 	struct nvmet_ctrl *ctrl;
543 	int ret;
544 
545 	mutex_lock(&subsys->lock);
546 	ret = 0;
547 	if (ns->enabled)
548 		goto out_unlock;
549 
550 	ret = -EMFILE;
551 	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
552 		goto out_unlock;
553 
554 	ret = nvmet_bdev_ns_enable(ns);
555 	if (ret == -ENOTBLK)
556 		ret = nvmet_file_ns_enable(ns);
557 	if (ret)
558 		goto out_unlock;
559 
560 	ret = nvmet_p2pmem_ns_enable(ns);
561 	if (ret)
562 		goto out_dev_disable;
563 
564 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
565 		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
566 
567 	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
568 				0, GFP_KERNEL);
569 	if (ret)
570 		goto out_dev_put;
571 
572 	if (ns->nsid > subsys->max_nsid)
573 		subsys->max_nsid = ns->nsid;
574 
575 	/*
576 	 * The namespaces list needs to be sorted to simplify the implementation
577 	 * of the Identify Namepace List subcommand.
578 	 */
579 	if (list_empty(&subsys->namespaces)) {
580 		list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
581 	} else {
582 		struct nvmet_ns *old;
583 
584 		list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
585 					lockdep_is_held(&subsys->lock)) {
586 			BUG_ON(ns->nsid == old->nsid);
587 			if (ns->nsid < old->nsid)
588 				break;
589 		}
590 
591 		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
592 	}
593 	subsys->nr_namespaces++;
594 
595 	nvmet_ns_changed(subsys, ns->nsid);
596 	ns->enabled = true;
597 	ret = 0;
598 out_unlock:
599 	mutex_unlock(&subsys->lock);
600 	return ret;
601 out_dev_put:
602 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
603 		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
604 out_dev_disable:
605 	nvmet_ns_dev_disable(ns);
606 	goto out_unlock;
607 }
608 
609 void nvmet_ns_disable(struct nvmet_ns *ns)
610 {
611 	struct nvmet_subsys *subsys = ns->subsys;
612 	struct nvmet_ctrl *ctrl;
613 
614 	mutex_lock(&subsys->lock);
615 	if (!ns->enabled)
616 		goto out_unlock;
617 
618 	ns->enabled = false;
619 	list_del_rcu(&ns->dev_link);
620 	if (ns->nsid == subsys->max_nsid)
621 		subsys->max_nsid = nvmet_max_nsid(subsys);
622 
623 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
624 		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
625 
626 	mutex_unlock(&subsys->lock);
627 
628 	/*
629 	 * Now that we removed the namespaces from the lookup list, we
630 	 * can kill the per_cpu ref and wait for any remaining references
631 	 * to be dropped, as well as a RCU grace period for anyone only
632 	 * using the namepace under rcu_read_lock().  Note that we can't
633 	 * use call_rcu here as we need to ensure the namespaces have
634 	 * been fully destroyed before unloading the module.
635 	 */
636 	percpu_ref_kill(&ns->ref);
637 	synchronize_rcu();
638 	wait_for_completion(&ns->disable_done);
639 	percpu_ref_exit(&ns->ref);
640 
641 	mutex_lock(&subsys->lock);
642 
643 	subsys->nr_namespaces--;
644 	nvmet_ns_changed(subsys, ns->nsid);
645 	nvmet_ns_dev_disable(ns);
646 out_unlock:
647 	mutex_unlock(&subsys->lock);
648 }
649 
650 void nvmet_ns_free(struct nvmet_ns *ns)
651 {
652 	nvmet_ns_disable(ns);
653 
654 	down_write(&nvmet_ana_sem);
655 	nvmet_ana_group_enabled[ns->anagrpid]--;
656 	up_write(&nvmet_ana_sem);
657 
658 	kfree(ns->device_path);
659 	kfree(ns);
660 }
661 
662 struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
663 {
664 	struct nvmet_ns *ns;
665 
666 	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
667 	if (!ns)
668 		return NULL;
669 
670 	INIT_LIST_HEAD(&ns->dev_link);
671 	init_completion(&ns->disable_done);
672 
673 	ns->nsid = nsid;
674 	ns->subsys = subsys;
675 
676 	down_write(&nvmet_ana_sem);
677 	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
678 	nvmet_ana_group_enabled[ns->anagrpid]++;
679 	up_write(&nvmet_ana_sem);
680 
681 	uuid_gen(&ns->uuid);
682 	ns->buffered_io = false;
683 
684 	return ns;
685 }
686 
687 static void nvmet_update_sq_head(struct nvmet_req *req)
688 {
689 	if (req->sq->size) {
690 		u32 old_sqhd, new_sqhd;
691 
692 		do {
693 			old_sqhd = req->sq->sqhd;
694 			new_sqhd = (old_sqhd + 1) % req->sq->size;
695 		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
696 					old_sqhd);
697 	}
698 	req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
699 }
700 
701 static void nvmet_set_error(struct nvmet_req *req, u16 status)
702 {
703 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
704 	struct nvme_error_slot *new_error_slot;
705 	unsigned long flags;
706 
707 	req->cqe->status = cpu_to_le16(status << 1);
708 
709 	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
710 		return;
711 
712 	spin_lock_irqsave(&ctrl->error_lock, flags);
713 	ctrl->err_counter++;
714 	new_error_slot =
715 		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
716 
717 	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
718 	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
719 	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
720 	new_error_slot->status_field = cpu_to_le16(status << 1);
721 	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
722 	new_error_slot->lba = cpu_to_le64(req->error_slba);
723 	new_error_slot->nsid = req->cmd->common.nsid;
724 	spin_unlock_irqrestore(&ctrl->error_lock, flags);
725 
726 	/* set the more bit for this request */
727 	req->cqe->status |= cpu_to_le16(1 << 14);
728 }
729 
730 static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
731 {
732 	if (!req->sq->sqhd_disabled)
733 		nvmet_update_sq_head(req);
734 	req->cqe->sq_id = cpu_to_le16(req->sq->qid);
735 	req->cqe->command_id = req->cmd->common.command_id;
736 
737 	if (unlikely(status))
738 		nvmet_set_error(req, status);
739 
740 	trace_nvmet_req_complete(req);
741 
742 	if (req->ns)
743 		nvmet_put_namespace(req->ns);
744 	req->ops->queue_response(req);
745 }
746 
747 void nvmet_req_complete(struct nvmet_req *req, u16 status)
748 {
749 	__nvmet_req_complete(req, status);
750 	percpu_ref_put(&req->sq->ref);
751 }
752 EXPORT_SYMBOL_GPL(nvmet_req_complete);
753 
754 void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
755 		u16 qid, u16 size)
756 {
757 	cq->qid = qid;
758 	cq->size = size;
759 
760 	ctrl->cqs[qid] = cq;
761 }
762 
763 void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
764 		u16 qid, u16 size)
765 {
766 	sq->sqhd = 0;
767 	sq->qid = qid;
768 	sq->size = size;
769 
770 	ctrl->sqs[qid] = sq;
771 }
772 
773 static void nvmet_confirm_sq(struct percpu_ref *ref)
774 {
775 	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
776 
777 	complete(&sq->confirm_done);
778 }
779 
780 void nvmet_sq_destroy(struct nvmet_sq *sq)
781 {
782 	u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
783 	struct nvmet_ctrl *ctrl = sq->ctrl;
784 
785 	/*
786 	 * If this is the admin queue, complete all AERs so that our
787 	 * queue doesn't have outstanding requests on it.
788 	 */
789 	if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
790 		nvmet_async_events_process(ctrl, status);
791 	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
792 	wait_for_completion(&sq->confirm_done);
793 	wait_for_completion(&sq->free_done);
794 	percpu_ref_exit(&sq->ref);
795 
796 	if (ctrl) {
797 		nvmet_ctrl_put(ctrl);
798 		sq->ctrl = NULL; /* allows reusing the queue later */
799 	}
800 }
801 EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
802 
803 static void nvmet_sq_free(struct percpu_ref *ref)
804 {
805 	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
806 
807 	complete(&sq->free_done);
808 }
809 
810 int nvmet_sq_init(struct nvmet_sq *sq)
811 {
812 	int ret;
813 
814 	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
815 	if (ret) {
816 		pr_err("percpu_ref init failed!\n");
817 		return ret;
818 	}
819 	init_completion(&sq->free_done);
820 	init_completion(&sq->confirm_done);
821 
822 	return 0;
823 }
824 EXPORT_SYMBOL_GPL(nvmet_sq_init);
825 
826 static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
827 		struct nvmet_ns *ns)
828 {
829 	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
830 
831 	if (unlikely(state == NVME_ANA_INACCESSIBLE))
832 		return NVME_SC_ANA_INACCESSIBLE;
833 	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
834 		return NVME_SC_ANA_PERSISTENT_LOSS;
835 	if (unlikely(state == NVME_ANA_CHANGE))
836 		return NVME_SC_ANA_TRANSITION;
837 	return 0;
838 }
839 
840 static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
841 {
842 	if (unlikely(req->ns->readonly)) {
843 		switch (req->cmd->common.opcode) {
844 		case nvme_cmd_read:
845 		case nvme_cmd_flush:
846 			break;
847 		default:
848 			return NVME_SC_NS_WRITE_PROTECTED;
849 		}
850 	}
851 
852 	return 0;
853 }
854 
855 static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
856 {
857 	struct nvme_command *cmd = req->cmd;
858 	u16 ret;
859 
860 	ret = nvmet_check_ctrl_status(req, cmd);
861 	if (unlikely(ret))
862 		return ret;
863 
864 	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
865 	if (unlikely(!req->ns)) {
866 		req->error_loc = offsetof(struct nvme_common_command, nsid);
867 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
868 	}
869 	ret = nvmet_check_ana_state(req->port, req->ns);
870 	if (unlikely(ret)) {
871 		req->error_loc = offsetof(struct nvme_common_command, nsid);
872 		return ret;
873 	}
874 	ret = nvmet_io_cmd_check_access(req);
875 	if (unlikely(ret)) {
876 		req->error_loc = offsetof(struct nvme_common_command, nsid);
877 		return ret;
878 	}
879 
880 	if (req->ns->file)
881 		return nvmet_file_parse_io_cmd(req);
882 	else
883 		return nvmet_bdev_parse_io_cmd(req);
884 }
885 
886 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
887 		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
888 {
889 	u8 flags = req->cmd->common.flags;
890 	u16 status;
891 
892 	req->cq = cq;
893 	req->sq = sq;
894 	req->ops = ops;
895 	req->sg = NULL;
896 	req->metadata_sg = NULL;
897 	req->sg_cnt = 0;
898 	req->metadata_sg_cnt = 0;
899 	req->transfer_len = 0;
900 	req->metadata_len = 0;
901 	req->cqe->status = 0;
902 	req->cqe->sq_head = 0;
903 	req->ns = NULL;
904 	req->error_loc = NVMET_NO_ERROR_LOC;
905 	req->error_slba = 0;
906 
907 	trace_nvmet_req_init(req, req->cmd);
908 
909 	/* no support for fused commands yet */
910 	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
911 		req->error_loc = offsetof(struct nvme_common_command, flags);
912 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
913 		goto fail;
914 	}
915 
916 	/*
917 	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
918 	 * contains an address of a single contiguous physical buffer that is
919 	 * byte aligned.
920 	 */
921 	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
922 		req->error_loc = offsetof(struct nvme_common_command, flags);
923 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
924 		goto fail;
925 	}
926 
927 	if (unlikely(!req->sq->ctrl))
928 		/* will return an error for any non-connect command: */
929 		status = nvmet_parse_connect_cmd(req);
930 	else if (likely(req->sq->qid != 0))
931 		status = nvmet_parse_io_cmd(req);
932 	else
933 		status = nvmet_parse_admin_cmd(req);
934 
935 	if (status)
936 		goto fail;
937 
938 	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
939 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
940 		goto fail;
941 	}
942 
943 	if (sq->ctrl)
944 		sq->ctrl->cmd_seen = true;
945 
946 	return true;
947 
948 fail:
949 	__nvmet_req_complete(req, status);
950 	return false;
951 }
952 EXPORT_SYMBOL_GPL(nvmet_req_init);
953 
954 void nvmet_req_uninit(struct nvmet_req *req)
955 {
956 	percpu_ref_put(&req->sq->ref);
957 	if (req->ns)
958 		nvmet_put_namespace(req->ns);
959 }
960 EXPORT_SYMBOL_GPL(nvmet_req_uninit);
961 
962 bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
963 {
964 	if (unlikely(len != req->transfer_len)) {
965 		req->error_loc = offsetof(struct nvme_common_command, dptr);
966 		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
967 		return false;
968 	}
969 
970 	return true;
971 }
972 EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
973 
974 bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
975 {
976 	if (unlikely(data_len > req->transfer_len)) {
977 		req->error_loc = offsetof(struct nvme_common_command, dptr);
978 		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
979 		return false;
980 	}
981 
982 	return true;
983 }
984 
985 static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
986 {
987 	return req->transfer_len - req->metadata_len;
988 }
989 
990 static int nvmet_req_alloc_p2pmem_sgls(struct nvmet_req *req)
991 {
992 	req->sg = pci_p2pmem_alloc_sgl(req->p2p_dev, &req->sg_cnt,
993 			nvmet_data_transfer_len(req));
994 	if (!req->sg)
995 		goto out_err;
996 
997 	if (req->metadata_len) {
998 		req->metadata_sg = pci_p2pmem_alloc_sgl(req->p2p_dev,
999 				&req->metadata_sg_cnt, req->metadata_len);
1000 		if (!req->metadata_sg)
1001 			goto out_free_sg;
1002 	}
1003 	return 0;
1004 out_free_sg:
1005 	pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1006 out_err:
1007 	return -ENOMEM;
1008 }
1009 
1010 static bool nvmet_req_find_p2p_dev(struct nvmet_req *req)
1011 {
1012 	if (!IS_ENABLED(CONFIG_PCI_P2PDMA))
1013 		return false;
1014 
1015 	if (req->sq->ctrl && req->sq->qid && req->ns) {
1016 		req->p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
1017 						 req->ns->nsid);
1018 		if (req->p2p_dev)
1019 			return true;
1020 	}
1021 
1022 	req->p2p_dev = NULL;
1023 	return false;
1024 }
1025 
1026 int nvmet_req_alloc_sgls(struct nvmet_req *req)
1027 {
1028 	if (nvmet_req_find_p2p_dev(req) && !nvmet_req_alloc_p2pmem_sgls(req))
1029 		return 0;
1030 
1031 	req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
1032 			    &req->sg_cnt);
1033 	if (unlikely(!req->sg))
1034 		goto out;
1035 
1036 	if (req->metadata_len) {
1037 		req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
1038 					     &req->metadata_sg_cnt);
1039 		if (unlikely(!req->metadata_sg))
1040 			goto out_free;
1041 	}
1042 
1043 	return 0;
1044 out_free:
1045 	sgl_free(req->sg);
1046 out:
1047 	return -ENOMEM;
1048 }
1049 EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);
1050 
1051 void nvmet_req_free_sgls(struct nvmet_req *req)
1052 {
1053 	if (req->p2p_dev) {
1054 		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1055 		if (req->metadata_sg)
1056 			pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
1057 	} else {
1058 		sgl_free(req->sg);
1059 		if (req->metadata_sg)
1060 			sgl_free(req->metadata_sg);
1061 	}
1062 
1063 	req->sg = NULL;
1064 	req->metadata_sg = NULL;
1065 	req->sg_cnt = 0;
1066 	req->metadata_sg_cnt = 0;
1067 }
1068 EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);
1069 
1070 static inline bool nvmet_cc_en(u32 cc)
1071 {
1072 	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
1073 }
1074 
1075 static inline u8 nvmet_cc_css(u32 cc)
1076 {
1077 	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
1078 }
1079 
1080 static inline u8 nvmet_cc_mps(u32 cc)
1081 {
1082 	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
1083 }
1084 
1085 static inline u8 nvmet_cc_ams(u32 cc)
1086 {
1087 	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
1088 }
1089 
1090 static inline u8 nvmet_cc_shn(u32 cc)
1091 {
1092 	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
1093 }
1094 
1095 static inline u8 nvmet_cc_iosqes(u32 cc)
1096 {
1097 	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
1098 }
1099 
1100 static inline u8 nvmet_cc_iocqes(u32 cc)
1101 {
1102 	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
1103 }
1104 
1105 static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
1106 {
1107 	lockdep_assert_held(&ctrl->lock);
1108 
1109 	if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
1110 	    nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
1111 	    nvmet_cc_mps(ctrl->cc) != 0 ||
1112 	    nvmet_cc_ams(ctrl->cc) != 0 ||
1113 	    nvmet_cc_css(ctrl->cc) != 0) {
1114 		ctrl->csts = NVME_CSTS_CFS;
1115 		return;
1116 	}
1117 
1118 	ctrl->csts = NVME_CSTS_RDY;
1119 
1120 	/*
1121 	 * Controllers that are not yet enabled should not really enforce the
1122 	 * keep alive timeout, but we still want to track a timeout and cleanup
1123 	 * in case a host died before it enabled the controller.  Hence, simply
1124 	 * reset the keep alive timer when the controller is enabled.
1125 	 */
1126 	mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
1127 }
1128 
1129 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
1130 {
1131 	lockdep_assert_held(&ctrl->lock);
1132 
1133 	/* XXX: tear down queues? */
1134 	ctrl->csts &= ~NVME_CSTS_RDY;
1135 	ctrl->cc = 0;
1136 }
1137 
1138 void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
1139 {
1140 	u32 old;
1141 
1142 	mutex_lock(&ctrl->lock);
1143 	old = ctrl->cc;
1144 	ctrl->cc = new;
1145 
1146 	if (nvmet_cc_en(new) && !nvmet_cc_en(old))
1147 		nvmet_start_ctrl(ctrl);
1148 	if (!nvmet_cc_en(new) && nvmet_cc_en(old))
1149 		nvmet_clear_ctrl(ctrl);
1150 	if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
1151 		nvmet_clear_ctrl(ctrl);
1152 		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1153 	}
1154 	if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
1155 		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1156 	mutex_unlock(&ctrl->lock);
1157 }
1158 
1159 static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
1160 {
1161 	/* command sets supported: NVMe command set: */
1162 	ctrl->cap = (1ULL << 37);
1163 	/* CC.EN timeout in 500msec units: */
1164 	ctrl->cap |= (15ULL << 24);
1165 	/* maximum queue entries supported: */
1166 	ctrl->cap |= NVMET_QUEUE_SIZE - 1;
1167 }
1168 
1169 u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1170 		struct nvmet_req *req, struct nvmet_ctrl **ret)
1171 {
1172 	struct nvmet_subsys *subsys;
1173 	struct nvmet_ctrl *ctrl;
1174 	u16 status = 0;
1175 
1176 	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1177 	if (!subsys) {
1178 		pr_warn("connect request for invalid subsystem %s!\n",
1179 			subsysnqn);
1180 		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1181 		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1182 	}
1183 
1184 	mutex_lock(&subsys->lock);
1185 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
1186 		if (ctrl->cntlid == cntlid) {
1187 			if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
1188 				pr_warn("hostnqn mismatch.\n");
1189 				continue;
1190 			}
1191 			if (!kref_get_unless_zero(&ctrl->ref))
1192 				continue;
1193 
1194 			*ret = ctrl;
1195 			goto out;
1196 		}
1197 	}
1198 
1199 	pr_warn("could not find controller %d for subsys %s / host %s\n",
1200 		cntlid, subsysnqn, hostnqn);
1201 	req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1202 	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1203 
1204 out:
1205 	mutex_unlock(&subsys->lock);
1206 	nvmet_subsys_put(subsys);
1207 	return status;
1208 }
1209 
1210 u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
1211 {
1212 	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
1213 		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
1214 		       cmd->common.opcode, req->sq->qid);
1215 		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1216 	}
1217 
1218 	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
1219 		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
1220 		       cmd->common.opcode, req->sq->qid);
1221 		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1222 	}
1223 	return 0;
1224 }
1225 
1226 bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1227 {
1228 	struct nvmet_host_link *p;
1229 
1230 	lockdep_assert_held(&nvmet_config_sem);
1231 
1232 	if (subsys->allow_any_host)
1233 		return true;
1234 
1235 	if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
1236 		return true;
1237 
1238 	list_for_each_entry(p, &subsys->hosts, entry) {
1239 		if (!strcmp(nvmet_host_name(p->host), hostnqn))
1240 			return true;
1241 	}
1242 
1243 	return false;
1244 }
1245 
1246 /*
1247  * Note: ctrl->subsys->lock should be held when calling this function
1248  */
1249 static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1250 		struct nvmet_req *req)
1251 {
1252 	struct nvmet_ns *ns;
1253 
1254 	if (!req->p2p_client)
1255 		return;
1256 
1257 	ctrl->p2p_client = get_device(req->p2p_client);
1258 
1259 	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
1260 				lockdep_is_held(&ctrl->subsys->lock))
1261 		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1262 }
1263 
1264 /*
1265  * Note: ctrl->subsys->lock should be held when calling this function
1266  */
1267 static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1268 {
1269 	struct radix_tree_iter iter;
1270 	void __rcu **slot;
1271 
1272 	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1273 		pci_dev_put(radix_tree_deref_slot(slot));
1274 
1275 	put_device(ctrl->p2p_client);
1276 }
1277 
1278 static void nvmet_fatal_error_handler(struct work_struct *work)
1279 {
1280 	struct nvmet_ctrl *ctrl =
1281 			container_of(work, struct nvmet_ctrl, fatal_err_work);
1282 
1283 	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
1284 	ctrl->ops->delete_ctrl(ctrl);
1285 }
1286 
1287 u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1288 		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
1289 {
1290 	struct nvmet_subsys *subsys;
1291 	struct nvmet_ctrl *ctrl;
1292 	int ret;
1293 	u16 status;
1294 
1295 	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1296 	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1297 	if (!subsys) {
1298 		pr_warn("connect request for invalid subsystem %s!\n",
1299 			subsysnqn);
1300 		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1301 		goto out;
1302 	}
1303 
1304 	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1305 	down_read(&nvmet_config_sem);
1306 	if (!nvmet_host_allowed(subsys, hostnqn)) {
1307 		pr_info("connect by host %s for subsystem %s not allowed\n",
1308 			hostnqn, subsysnqn);
1309 		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1310 		up_read(&nvmet_config_sem);
1311 		status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1312 		goto out_put_subsystem;
1313 	}
1314 	up_read(&nvmet_config_sem);
1315 
1316 	status = NVME_SC_INTERNAL;
1317 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1318 	if (!ctrl)
1319 		goto out_put_subsystem;
1320 	mutex_init(&ctrl->lock);
1321 
1322 	nvmet_init_cap(ctrl);
1323 
1324 	ctrl->port = req->port;
1325 
1326 	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
1327 	INIT_LIST_HEAD(&ctrl->async_events);
1328 	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
1329 	INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
1330 
1331 	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
1332 	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
1333 
1334 	kref_init(&ctrl->ref);
1335 	ctrl->subsys = subsys;
1336 	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
1337 
1338 	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
1339 			sizeof(__le32), GFP_KERNEL);
1340 	if (!ctrl->changed_ns_list)
1341 		goto out_free_ctrl;
1342 
1343 	ctrl->cqs = kcalloc(subsys->max_qid + 1,
1344 			sizeof(struct nvmet_cq *),
1345 			GFP_KERNEL);
1346 	if (!ctrl->cqs)
1347 		goto out_free_changed_ns_list;
1348 
1349 	ctrl->sqs = kcalloc(subsys->max_qid + 1,
1350 			sizeof(struct nvmet_sq *),
1351 			GFP_KERNEL);
1352 	if (!ctrl->sqs)
1353 		goto out_free_cqs;
1354 
1355 	if (subsys->cntlid_min > subsys->cntlid_max)
1356 		goto out_free_cqs;
1357 
1358 	ret = ida_simple_get(&cntlid_ida,
1359 			     subsys->cntlid_min, subsys->cntlid_max,
1360 			     GFP_KERNEL);
1361 	if (ret < 0) {
1362 		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
1363 		goto out_free_sqs;
1364 	}
1365 	ctrl->cntlid = ret;
1366 
1367 	ctrl->ops = req->ops;
1368 
1369 	/*
1370 	 * Discovery controllers may use some arbitrary high value
1371 	 * in order to cleanup stale discovery sessions
1372 	 */
1373 	if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
1374 		kato = NVMET_DISC_KATO_MS;
1375 
1376 	/* keep-alive timeout in seconds */
1377 	ctrl->kato = DIV_ROUND_UP(kato, 1000);
1378 
1379 	ctrl->err_counter = 0;
1380 	spin_lock_init(&ctrl->error_lock);
1381 
1382 	nvmet_start_keep_alive_timer(ctrl);
1383 
1384 	mutex_lock(&subsys->lock);
1385 	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1386 	nvmet_setup_p2p_ns_map(ctrl, req);
1387 	mutex_unlock(&subsys->lock);
1388 
1389 	*ctrlp = ctrl;
1390 	return 0;
1391 
1392 out_free_sqs:
1393 	kfree(ctrl->sqs);
1394 out_free_cqs:
1395 	kfree(ctrl->cqs);
1396 out_free_changed_ns_list:
1397 	kfree(ctrl->changed_ns_list);
1398 out_free_ctrl:
1399 	kfree(ctrl);
1400 out_put_subsystem:
1401 	nvmet_subsys_put(subsys);
1402 out:
1403 	return status;
1404 }
1405 
1406 static void nvmet_ctrl_free(struct kref *ref)
1407 {
1408 	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
1409 	struct nvmet_subsys *subsys = ctrl->subsys;
1410 
1411 	mutex_lock(&subsys->lock);
1412 	nvmet_release_p2p_ns_map(ctrl);
1413 	list_del(&ctrl->subsys_entry);
1414 	mutex_unlock(&subsys->lock);
1415 
1416 	nvmet_stop_keep_alive_timer(ctrl);
1417 
1418 	flush_work(&ctrl->async_event_work);
1419 	cancel_work_sync(&ctrl->fatal_err_work);
1420 
1421 	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
1422 
1423 	nvmet_async_events_free(ctrl);
1424 	kfree(ctrl->sqs);
1425 	kfree(ctrl->cqs);
1426 	kfree(ctrl->changed_ns_list);
1427 	kfree(ctrl);
1428 
1429 	nvmet_subsys_put(subsys);
1430 }
1431 
1432 void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
1433 {
1434 	kref_put(&ctrl->ref, nvmet_ctrl_free);
1435 }
1436 
1437 void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
1438 {
1439 	mutex_lock(&ctrl->lock);
1440 	if (!(ctrl->csts & NVME_CSTS_CFS)) {
1441 		ctrl->csts |= NVME_CSTS_CFS;
1442 		schedule_work(&ctrl->fatal_err_work);
1443 	}
1444 	mutex_unlock(&ctrl->lock);
1445 }
1446 EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
1447 
1448 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
1449 		const char *subsysnqn)
1450 {
1451 	struct nvmet_subsys_link *p;
1452 
1453 	if (!port)
1454 		return NULL;
1455 
1456 	if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
1457 		if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
1458 			return NULL;
1459 		return nvmet_disc_subsys;
1460 	}
1461 
1462 	down_read(&nvmet_config_sem);
1463 	list_for_each_entry(p, &port->subsystems, entry) {
1464 		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
1465 				NVMF_NQN_SIZE)) {
1466 			if (!kref_get_unless_zero(&p->subsys->ref))
1467 				break;
1468 			up_read(&nvmet_config_sem);
1469 			return p->subsys;
1470 		}
1471 	}
1472 	up_read(&nvmet_config_sem);
1473 	return NULL;
1474 }
1475 
1476 struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1477 		enum nvme_subsys_type type)
1478 {
1479 	struct nvmet_subsys *subsys;
1480 
1481 	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1482 	if (!subsys)
1483 		return ERR_PTR(-ENOMEM);
1484 
1485 	subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
1486 	/* generate a random serial number as our controllers are ephemeral: */
1487 	get_random_bytes(&subsys->serial, sizeof(subsys->serial));
1488 
1489 	switch (type) {
1490 	case NVME_NQN_NVME:
1491 		subsys->max_qid = NVMET_NR_QUEUES;
1492 		break;
1493 	case NVME_NQN_DISC:
1494 		subsys->max_qid = 0;
1495 		break;
1496 	default:
1497 		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1498 		kfree(subsys);
1499 		return ERR_PTR(-EINVAL);
1500 	}
1501 	subsys->type = type;
1502 	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1503 			GFP_KERNEL);
1504 	if (!subsys->subsysnqn) {
1505 		kfree(subsys);
1506 		return ERR_PTR(-ENOMEM);
1507 	}
1508 	subsys->cntlid_min = NVME_CNTLID_MIN;
1509 	subsys->cntlid_max = NVME_CNTLID_MAX;
1510 	kref_init(&subsys->ref);
1511 
1512 	mutex_init(&subsys->lock);
1513 	INIT_LIST_HEAD(&subsys->namespaces);
1514 	INIT_LIST_HEAD(&subsys->ctrls);
1515 	INIT_LIST_HEAD(&subsys->hosts);
1516 
1517 	return subsys;
1518 }
1519 
1520 static void nvmet_subsys_free(struct kref *ref)
1521 {
1522 	struct nvmet_subsys *subsys =
1523 		container_of(ref, struct nvmet_subsys, ref);
1524 
1525 	WARN_ON_ONCE(!list_empty(&subsys->namespaces));
1526 
1527 	kfree(subsys->subsysnqn);
1528 	kfree_rcu(subsys->model, rcuhead);
1529 	kfree(subsys);
1530 }
1531 
1532 void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
1533 {
1534 	struct nvmet_ctrl *ctrl;
1535 
1536 	mutex_lock(&subsys->lock);
1537 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1538 		ctrl->ops->delete_ctrl(ctrl);
1539 	mutex_unlock(&subsys->lock);
1540 }
1541 
1542 void nvmet_subsys_put(struct nvmet_subsys *subsys)
1543 {
1544 	kref_put(&subsys->ref, nvmet_subsys_free);
1545 }
1546 
1547 static int __init nvmet_init(void)
1548 {
1549 	int error;
1550 
1551 	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
1552 
1553 	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
1554 			WQ_MEM_RECLAIM, 0);
1555 	if (!buffered_io_wq) {
1556 		error = -ENOMEM;
1557 		goto out;
1558 	}
1559 
1560 	error = nvmet_init_discovery();
1561 	if (error)
1562 		goto out_free_work_queue;
1563 
1564 	error = nvmet_init_configfs();
1565 	if (error)
1566 		goto out_exit_discovery;
1567 	return 0;
1568 
1569 out_exit_discovery:
1570 	nvmet_exit_discovery();
1571 out_free_work_queue:
1572 	destroy_workqueue(buffered_io_wq);
1573 out:
1574 	return error;
1575 }
1576 
1577 static void __exit nvmet_exit(void)
1578 {
1579 	nvmet_exit_configfs();
1580 	nvmet_exit_discovery();
1581 	ida_destroy(&cntlid_ida);
1582 	destroy_workqueue(buffered_io_wq);
1583 
1584 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
1585 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
1586 }
1587 
1588 module_init(nvmet_init);
1589 module_exit(nvmet_exit);
1590 
1591 MODULE_LICENSE("GPL v2");
1592