/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/nv.h>
#include <sys/refcount.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/nvmf_transport_internal.h>

/* Transport-independent support for fabrics queue pairs and commands. */

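/*
 * A registered transport backend.  nt_active_qpairs counts the queue
 * pairs currently using this transport; module unload sleeps until
 * the count drops to zero.
 */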
struct nvmf_transport {
	struct nvmf_transport_ops *nt_ops;

	volatile u_int nt_active_qpairs;
	SLIST_ENTRY(nvmf_transport) nt_link;
};

/* nvmf_transports[nvmf_trtype] is sorted by priority */
static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1];
static struct sx nvmf_transports_lock;

static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport",
    "NVMe over Fabrics transport");

SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NVMe over Fabrics");

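/* Reject transport types beyond the bounds of nvmf_transports[]. */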
static bool
nvmf_supported_trtype(enum nvmf_trtype trtype)
{
	return (trtype < nitems(nvmf_transports));
}

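/*
 * Walk the transports registered for a type in priority order and
 * return the first queue pair one of them can allocate, holding a
 * reference on the chosen transport for the life of the queue pair.
 */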
struct nvmf_qpair *
nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
    const nvlist_t *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg,
    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
{
	struct nvmf_transport *nt;
	struct nvmf_qpair *qp;

	if (!nvmf_supported_trtype(trtype))
		return (NULL);

	sx_slock(&nvmf_transports_lock);
	SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) {
		qp = nt->nt_ops->allocate_qpair(controller, params);
		if (qp != NULL) {
			refcount_acquire(&nt->nt_active_qpairs);
			break;
		}
	}
	sx_sunlock(&nvmf_transports_lock);

	/*
	 * 'nt' is NULL iff no transport allocated a queue pair.  'qp'
	 * is uninitialized when the list is empty, so test 'nt'.
	 */
	if (nt == NULL)
		return (NULL);

	qp->nq_transport = nt;
	qp->nq_ops = nt->nt_ops;
	qp->nq_controller = controller;
	qp->nq_error = error_cb;
	qp->nq_error_arg = error_cb_arg;
	qp->nq_receive = receive_cb;
	qp->nq_receive_arg = receive_cb_arg;
	qp->nq_admin = nvlist_get_bool(params, "admin");
	return (qp);
}

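/*
 * Free a queue pair and drop its transport reference, waking any
 * thread sleeping in MOD_UNLOAD for the last queue pair to drain.
 */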
void
nvmf_free_qpair(struct nvmf_qpair *qp)
{
	struct nvmf_transport *nt;

	nt = qp->nq_transport;
	qp->nq_ops->free_qpair(qp);
	if (refcount_release(&nt->nt_active_qpairs))
		wakeup(nt);
}

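/*
 * Allocate a command capsule wrapping the submission queue entry
 * 'sqe'.  'how' must be M_WAITOK or M_NOWAIT.
 */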
struct nvmf_capsule *
nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how)
{
	struct nvmf_capsule *nc;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));
	nc = qp->nq_ops->allocate_capsule(qp, how);
	if (nc == NULL)
		return (NULL);

	nc->nc_qpair = qp;
	nc->nc_qe_len = sizeof(struct nvme_command);
	memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len);

	/* 4.2 of NVMe base spec: Fabrics always uses SGL. */
	nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
	nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
	return (nc);
}

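/*
 * Allocate a response capsule wrapping the completion queue entry
 * 'cqe'.  'how' must be M_WAITOK or M_NOWAIT.
 */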
struct nvmf_capsule *
nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how)
{
	struct nvmf_capsule *nc;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));
	nc = qp->nq_ops->allocate_capsule(qp, how);
	if (nc == NULL)
		return (NULL);

	nc->nc_qpair = qp;
	nc->nc_qe_len = sizeof(struct nvme_completion);
	memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len);
	return (nc);
}

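/*
 * Attach a data buffer to a capsule.  A capsule carries at most one
 * data buffer, so a second call fails with EBUSY.
 */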
int
nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem,
    size_t len, bool send, nvmf_io_complete_t *complete_cb,
    void *cb_arg)
{
	if (nc->nc_data.io_len != 0)
		return (EBUSY);

	nc->nc_send_data = send;
	nc->nc_data.io_mem = *mem;
	nc->nc_data.io_len = len;
	nc->nc_data.io_complete = complete_cb;
	nc->nc_data.io_complete_arg = cb_arg;
	return (0);
}

void
nvmf_free_capsule(struct nvmf_capsule *nc)
{
	nc->nc_qpair->nq_ops->free_capsule(nc);
}

int
nvmf_transmit_capsule(struct nvmf_capsule *nc)
{
	return (nc->nc_qpair->nq_ops->transmit_capsule(nc));
}

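/*
 * Fail any data transfer still pending on a capsule, completing the
 * associated I/O request with 'error'.
 */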
void
nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error)
{
	if (nc->nc_data.io_len != 0)
		nvmf_complete_io_request(&nc->nc_data, 0, error);
}

void *
nvmf_capsule_sqe(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
	    ("%s: capsule %p is not a command capsule", __func__, nc));
	return (&nc->nc_sqe);
}

void *
nvmf_capsule_cqe(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
	    ("%s: capsule %p is not a response capsule", __func__, nc));
	return (&nc->nc_cqe);
}

bool
nvmf_sqhd_valid(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
	    ("%s: capsule %p is not a response capsule", __func__, nc));
	return (nc->nc_sqhd_valid);
}

uint64_t
nvmf_max_xfer_size(struct nvmf_qpair *qp)
{
	return (qp->nq_ops->max_xfer_size(qp));
}

uint32_t
nvmf_max_ioccsz(struct nvmf_qpair *qp)
{
	return (qp->nq_ops->max_ioccsz(qp));
}

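/*
 * Validate a received command capsule, returning an NVMe status
 * code.  Fabrics always uses SGLs (NVMe base spec 4.2), so a PSDT
 * field that does not select SGLs is rejected before asking the
 * transport to validate the rest of the capsule.
 */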
uint8_t
nvmf_validate_command_capsule(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
	    ("%s: capsule %p is not a command capsule", __func__, nc));

	if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL)
		return (NVME_SC_INVALID_FIELD);

	return (nc->nc_qpair->nq_ops->validate_command_capsule(nc));
}

size_t
nvmf_capsule_data_len(const struct nvmf_capsule *nc)
{
	return (nc->nc_qpair->nq_ops->capsule_data_len(nc));
}

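/*
 * Controller-side transfer of command data from the host into 'mem',
 * starting at 'data_offset' within the command's data buffer.
 * Completion is reported via 'complete_cb'.
 */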
int
nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb,
    void *cb_arg)
{
	struct nvmf_io_request io;

	io.io_mem = *mem;
	io.io_len = len;
	io.io_complete = complete_cb;
	io.io_complete_arg = cb_arg;
	return (nc->nc_qpair->nq_ops->receive_controller_data(nc, data_offset,
	    &io));
}

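/*
 * Controller-side transfer of command data to the host from mbuf
 * chain 'm', which must contain exactly 'len' bytes.
 */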
u_int
nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct mbuf *m, size_t len)
{
	MPASS(m_length(m, NULL) == len);
	return (nc->nc_qpair->nq_ops->send_controller_data(nc, data_offset, m,
	    len));
}

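/*
 * Pack an nvlist into the user buffer described by an nvmf_ioc_nv.
 * A caller that passes a zero-sized buffer receives the required
 * length in nv->len and can retry with a large enough buffer.
 */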
int
nvmf_pack_ioc_nvlist(const nvlist_t *nvl, struct nvmf_ioc_nv *nv)
{
	void *packed;
	int error;

	error = nvlist_error(nvl);
	if (error != 0)
		return (error);

	if (nv->size == 0) {
		nv->len = nvlist_size(nvl);
	} else {
		packed = nvlist_pack(nvl, &nv->len);
		if (packed == NULL)
			error = ENOMEM;
		else if (nv->len > nv->size)
			error = EFBIG;
		else
			error = copyout(packed, nv->data, nv->len);
		free(packed, M_NVLIST);
	}
	return (error);
}

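/* Copy in and unpack a packed nvlist from an nvmf_ioc_nv. */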
int
nvmf_unpack_ioc_nvlist(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	void *packed;
	nvlist_t *nvl;
	int error;

	packed = malloc(nv->size, M_NVMF_TRANSPORT, M_WAITOK);
	error = copyin(nv->data, packed, nv->size);
	if (error != 0) {
		free(packed, M_NVMF_TRANSPORT);
		return (error);
	}

	nvl = nvlist_unpack(packed, nv->size, 0);
	free(packed, M_NVMF_TRANSPORT);
	if (nvl == NULL)
		return (EINVAL);

	*nvlp = nvl;
	return (0);
}

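/*
 * Check that an nvlist describing a queue pair contains the required
 * keys and that the queue size and queue pointers are in range.  The
 * "sqtail" key is required only on the host side.
 */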
bool
nvmf_validate_qpair_nvlist(const nvlist_t *nvl, bool controller)
{
	uint64_t value, qsize;
	bool admin, valid;

	valid = true;
	valid &= nvlist_exists_bool(nvl, "admin");
	valid &= nvlist_exists_bool(nvl, "sq_flow_control");
	valid &= nvlist_exists_number(nvl, "qsize");
	valid &= nvlist_exists_number(nvl, "sqhd");
	if (!controller)
		valid &= nvlist_exists_number(nvl, "sqtail");
	if (!valid)
		return (false);

	admin = nvlist_get_bool(nvl, "admin");
	qsize = nvlist_get_number(nvl, "qsize");
	if (admin) {
		if (qsize < NVME_MIN_ADMIN_ENTRIES ||
		    qsize > NVME_MAX_ADMIN_ENTRIES)
			return (false);
	} else {
		if (qsize < NVME_MIN_IO_ENTRIES || qsize > NVME_MAX_IO_ENTRIES)
			return (false);
	}
	value = nvlist_get_number(nvl, "sqhd");
	if (value > qsize - 1)
		return (false);
	if (!controller) {
		value = nvlist_get_number(nvl, "sqtail");
		if (value > qsize - 1)
			return (false);
	}

	return (true);
}

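/*
 * Module event handler shared by transport modules.  MOD_LOAD
 * registers a transport, keeping each per-type list sorted by
 * descending priority; MOD_QUIESCE and MOD_UNLOAD fail or wait,
 * respectively, while the transport still has active queue pairs.
 */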
int
nvmf_transport_module_handler(struct module *mod, int what, void *arg)
{
	struct nvmf_transport_ops *ops = arg;
	struct nvmf_transport *nt, *nt2, *prev;
	int error;

	switch (what) {
	case MOD_LOAD:
		if (!nvmf_supported_trtype(ops->trtype)) {
			printf("NVMF: Unsupported transport %u\n",
			    ops->trtype);
			return (EINVAL);
		}

		nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO);
		nt->nt_ops = arg;

		sx_xlock(&nvmf_transports_lock);
		if (SLIST_EMPTY(&nvmf_transports[ops->trtype])) {
			SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt,
			    nt_link);
		} else {
			prev = NULL;
			SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype],
			    nt_link) {
				if (ops->priority > nt2->nt_ops->priority)
					break;
				prev = nt2;
			}
			if (prev == NULL)
				SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype],
				    nt, nt_link);
			else
				SLIST_INSERT_AFTER(prev, nt, nt_link);
		}
		sx_xunlock(&nvmf_transports_lock);
		return (0);

	case MOD_QUIESCE:
		if (!nvmf_supported_trtype(ops->trtype))
			return (0);

		sx_slock(&nvmf_transports_lock);
		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
			if (nt->nt_ops == ops)
				break;
		}
		if (nt == NULL) {
			sx_sunlock(&nvmf_transports_lock);
			return (0);
		}
		if (nt->nt_active_qpairs != 0) {
			sx_sunlock(&nvmf_transports_lock);
			return (EBUSY);
		}
		sx_sunlock(&nvmf_transports_lock);
		return (0);

	case MOD_UNLOAD:
		if (!nvmf_supported_trtype(ops->trtype))
			return (0);

		sx_xlock(&nvmf_transports_lock);
		prev = NULL;
		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
			if (nt->nt_ops == ops)
				break;
			prev = nt;
		}
		if (nt == NULL) {
			sx_xunlock(&nvmf_transports_lock);
			return (0);
		}

		if (prev == NULL)
			SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype],
			    nt_link);
		else
			SLIST_REMOVE_AFTER(prev, nt_link);

		error = 0;
		while (nt->nt_active_qpairs != 0 && error == 0)
			error = sx_sleep(nt, &nvmf_transports_lock, PCATCH,
			    "nftunld", 0);
		sx_xunlock(&nvmf_transports_lock);
		if (error != 0)
			return (error);
		free(nt, M_NVMF_TRANSPORT);
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

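/*
 * Module handler for the transport layer itself: initialize the
 * per-type transport lists and their lock on load.  The module does
 * not support unload.
 */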
static int
nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused)
{
	switch (what) {
	case MOD_LOAD:
		for (u_int i = 0; i < nitems(nvmf_transports); i++)
			SLIST_INIT(&nvmf_transports[i]);
		sx_init(&nvmf_transports_lock, "nvmf transports");
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t nvmf_transport_mod = {
	"nvmf_transport",
	nvmf_transport_modevent,
	0
};

DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS,
    SI_ORDER_FIRST);
MODULE_VERSION(nvmf_transport, 1);