1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2023-2024 Chelsio Communications, Inc. 5 * Written by: John Baldwin <jhb@FreeBSD.org> 6 */ 7 8 #include <sys/param.h> 9 #include <sys/bio.h> 10 #include <sys/bus.h> 11 #include <sys/conf.h> 12 #include <sys/disk.h> 13 #include <sys/fcntl.h> 14 #include <sys/lock.h> 15 #include <sys/malloc.h> 16 #include <sys/memdesc.h> 17 #include <sys/mutex.h> 18 #include <sys/proc.h> 19 #include <sys/refcount.h> 20 #include <sys/sbuf.h> 21 #include <machine/stdarg.h> 22 #include <dev/nvme/nvme.h> 23 #include <dev/nvmf/host/nvmf_var.h> 24 25 struct nvmf_namespace { 26 struct nvmf_softc *sc; 27 uint64_t size; 28 uint32_t id; 29 u_int flags; 30 uint32_t lba_size; 31 bool disconnected; 32 33 TAILQ_HEAD(, bio) pending_bios; 34 struct mtx lock; 35 volatile u_int active_bios; 36 37 struct cdev *cdev; 38 }; 39 40 static void nvmf_ns_strategy(struct bio *bio); 41 42 static void 43 ns_printf(struct nvmf_namespace *ns, const char *fmt, ...) 44 { 45 char buf[128]; 46 struct sbuf sb; 47 va_list ap; 48 49 sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); 50 sbuf_set_drain(&sb, sbuf_printf_drain, NULL); 51 52 sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev), 53 ns->id); 54 55 va_start(ap, fmt); 56 sbuf_vprintf(&sb, fmt, ap); 57 va_end(ap); 58 59 sbuf_finish(&sb); 60 sbuf_delete(&sb); 61 } 62 63 /* 64 * The I/O completion may trigger after the received CQE if the I/O 65 * used a zero-copy mbuf that isn't harvested until after the NIC 66 * driver processes TX completions. Abuse bio_driver1 as a refcount. 67 * Store I/O errors in bio_driver2. 68 */ 69 static __inline u_int * 70 bio_refs(struct bio *bio) 71 { 72 return ((u_int *)&bio->bio_driver1); 73 } 74 75 static void 76 nvmf_ns_biodone(struct bio *bio) 77 { 78 struct nvmf_namespace *ns; 79 int error; 80 81 if (!refcount_release(bio_refs(bio))) 82 return; 83 84 ns = bio->bio_dev->si_drv1; 85 86 /* If a request is aborted, resubmit or queue it for resubmission. */ 87 if (bio->bio_error == ECONNABORTED) { 88 bio->bio_error = 0; 89 bio->bio_driver2 = 0; 90 mtx_lock(&ns->lock); 91 if (ns->disconnected) { 92 TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); 93 mtx_unlock(&ns->lock); 94 } else { 95 mtx_unlock(&ns->lock); 96 nvmf_ns_strategy(bio); 97 } 98 } else { 99 /* 100 * I/O errors take precedence over generic EIO from 101 * CQE errors. 102 */ 103 error = (intptr_t)bio->bio_driver2; 104 if (error != 0) 105 bio->bio_error = error; 106 if (bio->bio_error != 0) 107 bio->bio_flags |= BIO_ERROR; 108 biodone(bio); 109 } 110 111 if (refcount_release(&ns->active_bios)) 112 wakeup(ns); 113 } 114 115 static void 116 nvmf_ns_io_complete(void *arg, size_t xfered, int error) 117 { 118 struct bio *bio = arg; 119 120 KASSERT(xfered <= bio->bio_bcount, 121 ("%s: xfered > bio_bcount", __func__)); 122 123 bio->bio_driver2 = (void *)(intptr_t)error; 124 bio->bio_resid = bio->bio_bcount - xfered; 125 126 nvmf_ns_biodone(bio); 127 } 128 129 static void 130 nvmf_ns_delete_complete(void *arg, size_t xfered, int error) 131 { 132 struct bio *bio = arg; 133 134 if (error != 0) 135 bio->bio_resid = bio->bio_bcount; 136 else 137 bio->bio_resid = 0; 138 139 free(bio->bio_driver2, M_NVMF); 140 bio->bio_driver2 = (void *)(intptr_t)error; 141 142 nvmf_ns_biodone(bio); 143 } 144 145 static void 146 nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe) 147 { 148 struct bio *bio = arg; 149 150 if (nvmf_cqe_aborted(cqe)) 151 bio->bio_error = ECONNABORTED; 152 else if (cqe->status != 0) 153 bio->bio_error = EIO; 154 155 nvmf_ns_biodone(bio); 156 } 157 158 static int 159 nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) 160 { 161 struct nvme_command cmd; 162 struct nvmf_request *req; 163 struct nvme_dsm_range *dsm_range; 164 struct memdesc mem; 165 uint64_t lba, lba_count; 166 167 dsm_range = NULL; 168 memset(&cmd, 0, sizeof(cmd)); 169 switch (bio->bio_cmd) { 170 case BIO_READ: 171 lba = bio->bio_offset / ns->lba_size; 172 lba_count = bio->bio_bcount / ns->lba_size; 173 nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count); 174 break; 175 case BIO_WRITE: 176 lba = bio->bio_offset / ns->lba_size; 177 lba_count = bio->bio_bcount / ns->lba_size; 178 nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count); 179 break; 180 case BIO_FLUSH: 181 nvme_ns_flush_cmd(&cmd, ns->id); 182 break; 183 case BIO_DELETE: 184 dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT | 185 M_ZERO); 186 if (dsm_range == NULL) 187 return (ENOMEM); 188 lba = bio->bio_offset / ns->lba_size; 189 lba_count = bio->bio_bcount / ns->lba_size; 190 dsm_range->starting_lba = htole64(lba); 191 dsm_range->length = htole32(lba_count); 192 193 cmd.opc = NVME_OPC_DATASET_MANAGEMENT; 194 cmd.nsid = htole32(ns->id); 195 cmd.cdw10 = htole32(0); /* 1 range */ 196 cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); 197 break; 198 default: 199 return (EOPNOTSUPP); 200 } 201 202 mtx_lock(&ns->lock); 203 if (ns->disconnected) { 204 TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); 205 mtx_unlock(&ns->lock); 206 free(dsm_range, M_NVMF); 207 return (0); 208 } 209 210 req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd, 211 nvmf_ns_bio_complete, bio, M_NOWAIT); 212 if (req == NULL) { 213 mtx_unlock(&ns->lock); 214 free(dsm_range, M_NVMF); 215 return (ENOMEM); 216 } 217 218 switch (bio->bio_cmd) { 219 case BIO_READ: 220 case BIO_WRITE: 221 refcount_init(bio_refs(bio), 2); 222 mem = memdesc_bio(bio); 223 nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount, 224 bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio); 225 break; 226 case BIO_DELETE: 227 refcount_init(bio_refs(bio), 2); 228 mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range)); 229 nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range), 230 true, nvmf_ns_delete_complete, bio); 231 bio->bio_driver2 = dsm_range; 232 break; 233 default: 234 refcount_init(bio_refs(bio), 1); 235 KASSERT(bio->bio_resid == 0, 236 ("%s: input bio_resid != 0", __func__)); 237 break; 238 } 239 240 refcount_acquire(&ns->active_bios); 241 nvmf_submit_request(req); 242 mtx_unlock(&ns->lock); 243 return (0); 244 } 245 246 static int 247 nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, 248 struct thread *td) 249 { 250 struct nvmf_namespace *ns = dev->si_drv1; 251 struct nvme_get_nsid *gnsid; 252 struct nvme_pt_command *pt; 253 254 switch (cmd) { 255 case NVME_PASSTHROUGH_CMD: 256 pt = (struct nvme_pt_command *)arg; 257 pt->cmd.nsid = htole32(ns->id); 258 return (nvmf_passthrough_cmd(ns->sc, pt, false)); 259 case NVME_GET_NSID: 260 gnsid = (struct nvme_get_nsid *)arg; 261 strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), 262 sizeof(gnsid->cdev)); 263 gnsid->nsid = ns->id; 264 return (0); 265 case DIOCGMEDIASIZE: 266 *(off_t *)arg = ns->size; 267 return (0); 268 case DIOCGSECTORSIZE: 269 *(u_int *)arg = ns->lba_size; 270 return (0); 271 default: 272 return (ENOTTY); 273 } 274 } 275 276 static int 277 nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 278 { 279 int error; 280 281 error = 0; 282 if ((oflags & FWRITE) != 0) 283 error = securelevel_gt(td->td_ucred, 0); 284 return (error); 285 } 286 287 void 288 nvmf_ns_strategy(struct bio *bio) 289 { 290 struct nvmf_namespace *ns; 291 int error; 292 293 ns = bio->bio_dev->si_drv1; 294 295 error = nvmf_ns_submit_bio(ns, bio); 296 if (error != 0) { 297 bio->bio_error = error; 298 bio->bio_flags |= BIO_ERROR; 299 bio->bio_resid = bio->bio_bcount; 300 biodone(bio); 301 } 302 } 303 304 static struct cdevsw nvmf_ns_cdevsw = { 305 .d_version = D_VERSION, 306 .d_flags = D_DISK, 307 .d_open = nvmf_ns_open, 308 .d_read = physread, 309 .d_write = physwrite, 310 .d_strategy = nvmf_ns_strategy, 311 .d_ioctl = nvmf_ns_ioctl 312 }; 313 314 struct nvmf_namespace * 315 nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, 316 struct nvme_namespace_data *data) 317 { 318 struct make_dev_args mda; 319 struct nvmf_namespace *ns; 320 int error; 321 uint8_t lbads, lbaf; 322 323 ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO); 324 ns->sc = sc; 325 ns->id = id; 326 TAILQ_INIT(&ns->pending_bios); 327 mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF); 328 329 /* One dummy bio avoids dropping to 0 until destroy. */ 330 refcount_init(&ns->active_bios, 1); 331 332 if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { 333 ns_printf(ns, "End-to-end data protection not supported\n"); 334 goto fail; 335 } 336 337 lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); 338 if (lbaf > data->nlbaf) { 339 ns_printf(ns, "Invalid LBA format index\n"); 340 goto fail; 341 } 342 343 if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { 344 ns_printf(ns, "Namespaces with metadata are not supported\n"); 345 goto fail; 346 } 347 348 lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); 349 if (lbads == 0) { 350 ns_printf(ns, "Invalid LBA format index\n"); 351 goto fail; 352 } 353 354 ns->lba_size = 1 << lbads; 355 ns->size = data->nsze * ns->lba_size; 356 357 if (nvme_ctrlr_has_dataset_mgmt(sc->cdata)) 358 ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED; 359 360 if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0) 361 ns->flags |= NVME_NS_FLUSH_SUPPORTED; 362 363 /* 364 * XXX: Does any of the boundary splitting for NOIOB make any 365 * sense for Fabrics? 366 */ 367 368 make_dev_args_init(&mda); 369 mda.mda_devsw = &nvmf_ns_cdevsw; 370 mda.mda_uid = UID_ROOT; 371 mda.mda_gid = GID_WHEEL; 372 mda.mda_mode = 0600; 373 mda.mda_si_drv1 = ns; 374 error = make_dev_s(&mda, &ns->cdev, "%sns%u", 375 device_get_nameunit(sc->dev), id); 376 if (error != 0) 377 goto fail; 378 379 ns->cdev->si_flags |= SI_UNMAPPED; 380 381 return (ns); 382 fail: 383 mtx_destroy(&ns->lock); 384 free(ns, M_NVMF); 385 return (NULL); 386 } 387 388 void 389 nvmf_disconnect_ns(struct nvmf_namespace *ns) 390 { 391 mtx_lock(&ns->lock); 392 ns->disconnected = true; 393 mtx_unlock(&ns->lock); 394 } 395 396 void 397 nvmf_reconnect_ns(struct nvmf_namespace *ns) 398 { 399 TAILQ_HEAD(, bio) bios; 400 struct bio *bio; 401 402 mtx_lock(&ns->lock); 403 ns->disconnected = false; 404 TAILQ_INIT(&bios); 405 TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); 406 mtx_unlock(&ns->lock); 407 408 while (!TAILQ_EMPTY(&bios)) { 409 bio = TAILQ_FIRST(&bios); 410 TAILQ_REMOVE(&bios, bio, bio_queue); 411 nvmf_ns_strategy(bio); 412 } 413 } 414 415 void 416 nvmf_destroy_ns(struct nvmf_namespace *ns) 417 { 418 TAILQ_HEAD(, bio) bios; 419 struct bio *bio; 420 421 destroy_dev(ns->cdev); 422 423 /* 424 * Wait for active I/O requests to drain. The release drops 425 * the reference on the "dummy bio" when the namespace is 426 * created. 427 */ 428 mtx_lock(&ns->lock); 429 if (!refcount_release(&ns->active_bios)) { 430 while (ns->active_bios != 0) 431 mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0); 432 } 433 434 /* Abort any pending I/O requests. */ 435 TAILQ_INIT(&bios); 436 TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); 437 mtx_unlock(&ns->lock); 438 439 while (!TAILQ_EMPTY(&bios)) { 440 bio = TAILQ_FIRST(&bios); 441 TAILQ_REMOVE(&bios, bio, bio_queue); 442 bio->bio_error = ECONNABORTED; 443 bio->bio_flags |= BIO_ERROR; 444 bio->bio_resid = bio->bio_bcount; 445 biodone(bio); 446 } 447 448 mtx_destroy(&ns->lock); 449 free(ns, M_NVMF); 450 } 451 452 bool 453 nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data) 454 { 455 uint8_t lbads, lbaf; 456 457 if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { 458 ns_printf(ns, "End-to-end data protection not supported\n"); 459 return (false); 460 } 461 462 lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); 463 if (lbaf > data->nlbaf) { 464 ns_printf(ns, "Invalid LBA format index\n"); 465 return (false); 466 } 467 468 if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { 469 ns_printf(ns, "Namespaces with metadata are not supported\n"); 470 return (false); 471 } 472 473 lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); 474 if (lbads == 0) { 475 ns_printf(ns, "Invalid LBA format index\n"); 476 return (false); 477 } 478 479 ns->lba_size = 1 << lbads; 480 ns->size = data->nsze * ns->lba_size; 481 return (true); 482 } 483