/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>

struct nvmf_namespace {
	struct nvmf_softc *sc;
	uint64_t size;
	uint32_t id;
	u_int	flags;
	uint32_t lba_size;
	bool disconnected;

	TAILQ_HEAD(, bio) pending_bios;
	struct mtx lock;
	volatile u_int active_bios;

	struct cdev *cdev;
};

static void	nvmf_ns_strategy(struct bio *bio);

static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
	    ns->id);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

/*
 * The I/O completion may trigger after the received CQE if the I/O
 * used a zero-copy mbuf that isn't harvested until after the NIC
 * driver processes TX completions.  Abuse bio_driver1 as a refcount.
 * Store I/O errors in bio_driver2.
 */
static __inline u_int *
bio_refs(struct bio *bio)
{
	return ((u_int *)&bio->bio_driver1);
}

static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED) {
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			mtx_unlock(&ns->lock);
		} else {
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	if (refcount_release(&ns->active_bios))
		wakeup(ns);
}
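
/*
 * Data transfer callback for read/write capsules.  Any transfer error
 * is stashed in bio_driver2 for nvmf_ns_biodone() to pick up once both
 * the data transfer and the CQE have completed.
 */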
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	KASSERT(xfered <= bio->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bio->bio_driver2 = (void *)(intptr_t)error;
	bio->bio_resid = bio->bio_bcount - xfered;

	nvmf_ns_biodone(bio);
}

static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	if (error != 0)
		bio->bio_resid = bio->bio_bcount;
	else
		bio->bio_resid = 0;

	free(bio->bio_driver2, M_NVMF);
	bio->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bio);
}

static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bio = arg;

	if (nvmf_cqe_aborted(cqe))
		bio->bio_error = ECONNABORTED;
	else if (cqe->status != 0)
		bio->bio_error = EIO;

	nvmf_ns_biodone(bio);
}
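
/*
 * Translate a bio into an NVMe command and submit it on one of the
 * association's I/O queues.  If the association is currently
 * disconnected, the bio is queued on pending_bios and resubmitted by
 * nvmf_reconnect_ns().
 */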
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;

	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);		/* 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (0);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		bio->bio_driver2 = dsm_range;
		break;
	default:
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}

static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns = dev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		pt->cmd.nsid = htole32(ns->id);
		return (nvmf_passthrough_cmd(ns->sc, pt, false));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
		gnsid->nsid = ns->id;
		return (0);
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		return (0);
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		return (0);
	default:
		return (ENOTTY);
	}
}

static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	int error;

	error = 0;
	if ((oflags & FWRITE) != 0)
		error = securelevel_gt(td->td_ucred, 0);
	return (error);
}

static void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	ns = bio->bio_dev->si_drv1;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error != 0) {
		bio->bio_error = error;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}
}

static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};
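
/*
 * Create a namespace instance from IDENTIFY namespace data: validate
 * the active LBA format, compute the namespace size, and create the
 * character device used for I/O.
 */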
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/* A dummy bio keeps active_bios from dropping to zero until destroy. */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}

void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}

void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->disconnected = false;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		nvmf_ns_strategy(bio);
	}
}

void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain.  The release here
	 * drops the reference on the "dummy bio" taken when the
	 * namespace was created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}

bool
nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}