/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/nv.h>
#include <sys/refcount.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/nvmf_transport_internal.h>

/* Transport-independent support for fabrics queue pairs and commands. */

struct nvmf_transport {
	struct nvmf_transport_ops *nt_ops;

	volatile u_int nt_active_qpairs;
	SLIST_ENTRY(nvmf_transport) nt_link;
};

/* nvmf_transports[nvmf_trtype] is sorted by priority */
static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1];
static struct sx nvmf_transports_lock;

static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport",
    "NVMe over Fabrics transport");

SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NVMe over Fabrics");

static bool
nvmf_supported_trtype(enum nvmf_trtype trtype)
{
	return (trtype < nitems(nvmf_transports));
}

struct nvmf_qpair *
nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
    const nvlist_t *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg,
    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
{
	struct nvmf_transport *nt;
	struct nvmf_qpair *qp;

	if (!nvmf_supported_trtype(trtype))
		return (NULL);

	qp = NULL;
	sx_slock(&nvmf_transports_lock);
	SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) {
		qp = nt->nt_ops->allocate_qpair(controller, params);
		if (qp != NULL) {
			refcount_acquire(&nt->nt_active_qpairs);
			break;
		}
	}
	sx_sunlock(&nvmf_transports_lock);
	if (qp == NULL)
		return (NULL);

	qp->nq_transport = nt;
	qp->nq_ops = nt->nt_ops;
	qp->nq_controller = controller;
	qp->nq_error = error_cb;
	qp->nq_error_arg = error_cb_arg;
	qp->nq_receive = receive_cb;
	qp->nq_receive_arg = receive_cb_arg;
	qp->nq_admin = nvlist_get_bool(params, "admin");
	return (qp);
}

void
nvmf_free_qpair(struct nvmf_qpair *qp)
{
	struct nvmf_transport *nt;

	nt = qp->nq_transport;
	qp->nq_ops->free_qpair(qp);
	if (refcount_release(&nt->nt_active_qpairs))
		wakeup(nt);
}

struct nvmf_capsule *
nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how)
{
	struct nvmf_capsule *nc;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));
	nc = qp->nq_ops->allocate_capsule(qp, how);
	if (nc == NULL)
		return (NULL);

	nc->nc_qpair = qp;
	nc->nc_qe_len = sizeof(struct nvme_command);
	memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len);

	/* 4.2 of NVMe base spec: Fabrics always uses SGL. */
	nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
	nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
	return (nc);
}

struct nvmf_capsule *
nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how)
{
	struct nvmf_capsule *nc;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));
	nc = qp->nq_ops->allocate_capsule(qp, how);
	if (nc == NULL)
		return (NULL);

	nc->nc_qpair = qp;
	nc->nc_qe_len = sizeof(struct nvme_completion);
	memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len);
	return (nc);
}

int
nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem,
    size_t len, bool send, nvmf_io_complete_t *complete_cb,
    void *cb_arg)
{
	if (nc->nc_data.io_len != 0)
		return (EBUSY);

	nc->nc_send_data = send;
	nc->nc_data.io_mem = *mem;
	nc->nc_data.io_len = len;
	nc->nc_data.io_complete = complete_cb;
	nc->nc_data.io_complete_arg = cb_arg;
	return (0);
}

void
nvmf_free_capsule(struct nvmf_capsule *nc)
{
	nc->nc_qpair->nq_ops->free_capsule(nc);
}

int
nvmf_transmit_capsule(struct nvmf_capsule *nc)
{
	return (nc->nc_qpair->nq_ops->transmit_capsule(nc));
}

void
nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error)
{
	if (nc->nc_data.io_len != 0)
		nvmf_complete_io_request(&nc->nc_data, 0, error);
}

void *
nvmf_capsule_sqe(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
	    ("%s: capsule %p is not a command capsule", __func__, nc));
	return (&nc->nc_sqe);
}

void *
nvmf_capsule_cqe(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
	    ("%s: capsule %p is not a response capsule", __func__, nc));
	return (&nc->nc_cqe);
}

bool
nvmf_sqhd_valid(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
	    ("%s: capsule %p is not a response capsule", __func__, nc));
	return (nc->nc_sqhd_valid);
}

uint64_t
nvmf_max_xfer_size(struct nvmf_qpair *qp)
{
	return (qp->nq_ops->max_xfer_size(qp));
}

uint32_t
nvmf_max_ioccsz(struct nvmf_qpair *qp)
{
	return (qp->nq_ops->max_ioccsz(qp));
}

uint8_t
nvmf_validate_command_capsule(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
	    ("%s: capsule %p is not a command capsule", __func__, nc));

	if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL)
		return (NVME_SC_INVALID_FIELD);

	return (nc->nc_qpair->nq_ops->validate_command_capsule(nc));
}

size_t
nvmf_capsule_data_len(const struct nvmf_capsule *nc)
{
	return (nc->nc_qpair->nq_ops->capsule_data_len(nc));
}

int
nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb,
    void *cb_arg)
{
	struct nvmf_io_request io;

	io.io_mem = *mem;
	io.io_len = len;
	io.io_complete = complete_cb;
	io.io_complete_arg = cb_arg;
	return (nc->nc_qpair->nq_ops->receive_controller_data(nc, data_offset,
	    &io));
}

u_int
nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct mbuf *m, size_t len)
{
	MPASS(m_length(m, NULL) == len);
	return (nc->nc_qpair->nq_ops->send_controller_data(nc, data_offset, m,
	    len));
}
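
/*
 * Example (illustrative sketch only, guarded out): one way a host-side
 * consumer of the capsule helpers above might issue a command that
 * carries a data buffer.  The function name, buffer wrapping, and error
 * policy here are hypothetical; capsule and completion lifetimes are
 * defined by the caller and the transport, not by this sketch.
 */
#if 0
static int
example_submit(struct nvmf_qpair *qp, const struct nvme_command *cmd,
    void *buf, size_t len, nvmf_io_complete_t *io_done, void *io_done_arg)
{
	struct nvmf_capsule *nc;
	struct memdesc mem;
	int error;

	/* Wrap the command SQE in a transport capsule. */
	nc = nvmf_allocate_command(qp, cmd, M_WAITOK);
	if (nc == NULL)
		return (ENOMEM);

	/* Attach a kernel virtual buffer; send == false, the host receives. */
	mem = memdesc_vaddr(buf, len);
	error = nvmf_capsule_append_data(nc, &mem, len, false, io_done,
	    io_done_arg);
	if (error == 0)
		error = nvmf_transmit_capsule(nc);
	if (error != 0) {
		/* Fail any attached I/O request and release the capsule. */
		nvmf_abort_capsule_data(nc, error);
		nvmf_free_capsule(nc);
	}
	return (error);
}
#endif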

int
nvmf_pack_ioc_nvlist(const nvlist_t *nvl, struct nvmf_ioc_nv *nv)
{
	void *packed;
	int error;

	error = nvlist_error(nvl);
	if (error != 0)
		return (error);

	if (nv->size == 0) {
		nv->len = nvlist_size(nvl);
	} else {
		packed = nvlist_pack(nvl, &nv->len);
		if (packed == NULL)
			error = ENOMEM;
		else if (nv->len > nv->size)
			error = EFBIG;
		else
			error = copyout(packed, nv->data, nv->len);
		free(packed, M_NVLIST);
	}
	return (error);
}

int
nvmf_unpack_ioc_nvlist(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	void *packed;
	nvlist_t *nvl;
	int error;

	packed = malloc(nv->size, M_NVMF_TRANSPORT, M_WAITOK);
	error = copyin(nv->data, packed, nv->size);
	if (error != 0) {
		free(packed, M_NVMF_TRANSPORT);
		return (error);
	}

	nvl = nvlist_unpack(packed, nv->size, 0);
	free(packed, M_NVMF_TRANSPORT);
	if (nvl == NULL)
		return (EINVAL);

	*nvlp = nvl;
	return (0);
}

bool
nvmf_validate_qpair_nvlist(const nvlist_t *nvl, bool controller)
{
	uint64_t value, qsize;
	bool admin, valid;

	valid = true;
	valid &= nvlist_exists_bool(nvl, "admin");
	valid &= nvlist_exists_bool(nvl, "sq_flow_control");
	valid &= nvlist_exists_number(nvl, "qsize");
	valid &= nvlist_exists_number(nvl, "sqhd");
	if (!controller)
		valid &= nvlist_exists_number(nvl, "sqtail");
	if (!valid)
		return (false);

	admin = nvlist_get_bool(nvl, "admin");
	qsize = nvlist_get_number(nvl, "qsize");
	if (admin) {
		if (qsize < NVME_MIN_ADMIN_ENTRIES ||
		    qsize > NVME_MAX_ADMIN_ENTRIES)
			return (false);
	} else {
		if (qsize < NVME_MIN_IO_ENTRIES || qsize > NVME_MAX_IO_ENTRIES)
			return (false);
	}
	value = nvlist_get_number(nvl, "sqhd");
	if (value > qsize - 1)
		return (false);
	if (!controller) {
		value = nvlist_get_number(nvl, "sqtail");
		if (value > qsize - 1)
			return (false);
	}

	return (true);
}

int
nvmf_transport_module_handler(struct module *mod, int what, void *arg)
{
	struct nvmf_transport_ops *ops = arg;
	struct nvmf_transport *nt, *nt2, *prev;
	int error;

	switch (what) {
	case MOD_LOAD:
		if (!nvmf_supported_trtype(ops->trtype)) {
			printf("NVMF: Unsupported transport %u\n",
			    ops->trtype);
			return (EINVAL);
		}

		nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO);
		nt->nt_ops = arg;

		sx_xlock(&nvmf_transports_lock);
		if (SLIST_EMPTY(&nvmf_transports[ops->trtype])) {
			SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt,
			    nt_link);
		} else {
			prev = NULL;
			SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype],
			    nt_link) {
				if (ops->priority > nt2->nt_ops->priority)
					break;
				prev = nt2;
			}
			if (prev == NULL)
				SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype],
				    nt, nt_link);
			else
				SLIST_INSERT_AFTER(prev, nt, nt_link);
		}
		sx_xunlock(&nvmf_transports_lock);
		return (0);

	case MOD_QUIESCE:
		if (!nvmf_supported_trtype(ops->trtype))
			return (0);

		sx_slock(&nvmf_transports_lock);
		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
			if (nt->nt_ops == ops)
				break;
		}
		if (nt == NULL) {
			sx_sunlock(&nvmf_transports_lock);
			return (0);
		}
		if (nt->nt_active_qpairs != 0) {
			sx_sunlock(&nvmf_transports_lock);
			return (EBUSY);
		}
		sx_sunlock(&nvmf_transports_lock);
		return (0);

	case MOD_UNLOAD:
		if (!nvmf_supported_trtype(ops->trtype))
			return (0);

		sx_xlock(&nvmf_transports_lock);
		prev = NULL;
		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
			if (nt->nt_ops == ops)
				break;
			prev = nt;
		}
		if (nt == NULL) {
			sx_xunlock(&nvmf_transports_lock);
			return (0);
		}

		if (prev == NULL)
			SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype],
			    nt_link);
		else
			SLIST_REMOVE_AFTER(prev, nt_link);

		error = 0;
		while (nt->nt_active_qpairs != 0 && error == 0)
			error = sx_sleep(nt, &nvmf_transports_lock, PCATCH,
			    "nftunld", 0);
		sx_xunlock(&nvmf_transports_lock);
		if (error != 0)
			return (error);
		free(nt, M_NVMF_TRANSPORT);
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

static int
nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused)
{
	switch (what) {
	case MOD_LOAD:
		for (u_int i = 0; i < nitems(nvmf_transports); i++)
			SLIST_INIT(&nvmf_transports[i]);
		sx_init(&nvmf_transports_lock, "nvmf transports");
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t nvmf_transport_mod = {
	"nvmf_transport",
	nvmf_transport_modevent,
	0
};

DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS,
    SI_ORDER_FIRST);
MODULE_VERSION(nvmf_transport, 1);
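
/*
 * Example (illustrative sketch only, guarded out): how a transport
 * backend might register its ops with nvmf_transport_module_handler()
 * via a module event handler.  The "footr" names are hypothetical; the
 * ops fields shown are the ones this file dereferences, and a real
 * backend would also define the callback functions themselves.
 */
#if 0
static struct nvmf_transport_ops footr_ops = {
	.allocate_qpair = footr_allocate_qpair,
	.free_qpair = footr_free_qpair,
	.allocate_capsule = footr_allocate_capsule,
	.free_capsule = footr_free_capsule,
	.transmit_capsule = footr_transmit_capsule,
	.validate_command_capsule = footr_validate_command_capsule,
	.capsule_data_len = footr_capsule_data_len,
	.receive_controller_data = footr_receive_controller_data,
	.send_controller_data = footr_send_controller_data,
	.max_xfer_size = footr_max_xfer_size,
	.max_ioccsz = footr_max_ioccsz,
	.trtype = NVMF_TRTYPE_TCP,
	.priority = 0,
};

/* The ops pointer is passed to nvmf_transport_module_handler() as arg. */
static moduledata_t nvmf_footr_mod = {
	"nvmf_footr",
	nvmf_transport_module_handler,
	&footr_ops
};

/* Initialize after nvmf_transport so the transport lists exist. */
DECLARE_MODULE(nvmf_footr, nvmf_footr_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
MODULE_DEPEND(nvmf_footr, nvmf_transport, 1, 1, 1);
#endif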