1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 */
7
8 #include <sys/param.h>
9 #include <sys/bio.h>
10 #include <sys/bus.h>
11 #include <sys/conf.h>
12 #include <sys/disk.h>
13 #include <sys/fcntl.h>
14 #include <sys/lock.h>
15 #include <sys/malloc.h>
16 #include <sys/memdesc.h>
17 #include <sys/mutex.h>
18 #include <sys/proc.h>
19 #include <sys/refcount.h>
20 #include <sys/sbuf.h>
21 #include <machine/stdarg.h>
22 #include <dev/nvme/nvme.h>
23 #include <dev/nvmf/host/nvmf_var.h>
24
/*
 * Per-namespace state for an NVMe-over-Fabrics host association.
 * One instance backs each /dev/<ctrlr>n<nsid> character device.
 */
struct nvmf_namespace {
	struct nvmf_softc *sc;		/* parent association softc */
	uint64_t size;			/* namespace capacity in bytes */
	uint32_t id;			/* NSID */
	u_int flags;			/* NVME_NS_*_SUPPORTED flags */
	uint32_t lba_size;		/* bytes per LBA (1 << LBADS) */
	bool disconnected;		/* association currently down */
	bool shutdown;			/* namespace is being torn down */

	/* Bios deferred while disconnected; resubmitted on reconnect. */
	TAILQ_HEAD(, bio) pending_bios;
	struct mtx lock;		/* protects disconnected/shutdown/pending_bios */
	volatile u_int active_bios;	/* in-flight bios + one dummy ref */

	struct cdev *cdev;		/* /dev/<ctrlr>n<nsid> */
};
40
41 static void nvmf_ns_strategy(struct bio *bio);
42
43 static void
ns_printf(struct nvmf_namespace * ns,const char * fmt,...)44 ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
45 {
46 char buf[128];
47 struct sbuf sb;
48 va_list ap;
49
50 sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
51 sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
52
53 sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
54 ns->id);
55
56 va_start(ap, fmt);
57 sbuf_vprintf(&sb, fmt, ap);
58 va_end(ap);
59
60 sbuf_finish(&sb);
61 sbuf_delete(&sb);
62 }
63
64 /*
65 * The I/O completion may trigger after the received CQE if the I/O
66 * used a zero-copy mbuf that isn't harvested until after the NIC
67 * driver processes TX completions. Abuse bio_driver1 as a refcount.
68 * Store I/O errors in bio_driver2.
69 */
70 static __inline u_int *
bio_refs(struct bio * bio)71 bio_refs(struct bio *bio)
72 {
73 return ((u_int *)&bio->bio_driver1);
74 }
75
/*
 * Drop one reference on a bio; the final reference completes it.
 * Aborted bios (connection loss) are transparently resubmitted or
 * queued for resubmission unless failures are requested or the
 * namespace is shutting down.
 */
static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	/* Not the last reference; the other completion path finishes. */
	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
		/* Reset per-attempt state before retrying. */
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			if (nvmf_fail_disconnect || ns->shutdown) {
				/* No reconnect is coming; fail the bio. */
				mtx_unlock(&ns->lock);
				bio->bio_error = ECONNABORTED;
				bio->bio_flags |= BIO_ERROR;
				bio->bio_resid = bio->bio_bcount;
				biodone(bio);
			} else {
				/* Park until nvmf_reconnect_ns() resubmits. */
				TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
				    bio_queue);
				mtx_unlock(&ns->lock);
			}
		} else {
			/* Already reconnected; resubmit immediately. */
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	/* Wake nvmf_destroy_ns() when the last in-flight bio drains. */
	if (refcount_release(&ns->active_bios))
		wakeup(ns);
}
124
125 static void
nvmf_ns_io_complete(void * arg,size_t xfered,int error)126 nvmf_ns_io_complete(void *arg, size_t xfered, int error)
127 {
128 struct bio *bio = arg;
129
130 KASSERT(xfered <= bio->bio_bcount,
131 ("%s: xfered > bio_bcount", __func__));
132
133 bio->bio_driver2 = (void *)(intptr_t)error;
134 bio->bio_resid = bio->bio_bcount - xfered;
135
136 nvmf_ns_biodone(bio);
137 }
138
139 static void
nvmf_ns_delete_complete(void * arg,size_t xfered,int error)140 nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
141 {
142 struct bio *bio = arg;
143
144 if (error != 0)
145 bio->bio_resid = bio->bio_bcount;
146 else
147 bio->bio_resid = 0;
148
149 free(bio->bio_driver2, M_NVMF);
150 bio->bio_driver2 = (void *)(intptr_t)error;
151
152 nvmf_ns_biodone(bio);
153 }
154
155 static void
nvmf_ns_bio_complete(void * arg,const struct nvme_completion * cqe)156 nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
157 {
158 struct bio *bio = arg;
159
160 if (nvmf_cqe_aborted(cqe))
161 bio->bio_error = ECONNABORTED;
162 else if (cqe->status != 0)
163 bio->bio_error = EIO;
164
165 nvmf_ns_biodone(bio);
166 }
167
/*
 * Translate a bio into an NVMe I/O command and submit it on one of
 * the association's I/O queues.  Returns 0 on success (including the
 * case where the bio was queued for later resubmission while
 * disconnected) or an errno; on error the caller completes the bio.
 */
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;
	int error;

	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		/*
		 * DSM deallocate transfers a range descriptor as
		 * command data; it is freed in nvmf_ns_delete_complete().
		 * M_NOWAIT: this path may run from bio completion context.
		 */
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);	/* 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		/*
		 * While disconnected, either fail the bio outright or
		 * park it for resubmission on reconnect.
		 */
		if (nvmf_fail_disconnect || ns->shutdown) {
			error = ECONNABORTED;
		} else {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			error = 0;
		}
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);	/* free(NULL) is a no-op */
		return (error);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		/*
		 * Two refs: one dropped by the CQE callback, one by
		 * the data-transfer completion (see bio_refs()).
		 */
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		/* Ownership of dsm_range passes to the completion. */
		bio->bio_driver2 = dsm_range;
		break;
	default:
		/* No data transfer; only the CQE callback completes. */
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}
261
262 static int
nvmf_ns_ioctl(struct cdev * dev,u_long cmd,caddr_t arg,int flag,struct thread * td)263 nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
264 struct thread *td)
265 {
266 struct nvmf_namespace *ns = dev->si_drv1;
267 struct nvme_get_nsid *gnsid;
268 struct nvme_pt_command *pt;
269
270 switch (cmd) {
271 case NVME_PASSTHROUGH_CMD:
272 pt = (struct nvme_pt_command *)arg;
273 pt->cmd.nsid = htole32(ns->id);
274 return (nvmf_passthrough_cmd(ns->sc, pt, false));
275 case NVME_GET_NSID:
276 gnsid = (struct nvme_get_nsid *)arg;
277 strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
278 sizeof(gnsid->cdev));
279 gnsid->nsid = ns->id;
280 return (0);
281 case DIOCGMEDIASIZE:
282 *(off_t *)arg = ns->size;
283 return (0);
284 case DIOCGSECTORSIZE:
285 *(u_int *)arg = ns->lba_size;
286 return (0);
287 default:
288 return (ENOTTY);
289 }
290 }
291
292 static int
nvmf_ns_open(struct cdev * dev,int oflags,int devtype,struct thread * td)293 nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
294 {
295 int error;
296
297 error = 0;
298 if ((oflags & FWRITE) != 0)
299 error = securelevel_gt(td->td_ucred, 0);
300 return (error);
301 }
302
303 void
nvmf_ns_strategy(struct bio * bio)304 nvmf_ns_strategy(struct bio *bio)
305 {
306 struct nvmf_namespace *ns;
307 int error;
308
309 ns = bio->bio_dev->si_drv1;
310
311 error = nvmf_ns_submit_bio(ns, bio);
312 if (error != 0) {
313 bio->bio_error = error;
314 bio->bio_flags |= BIO_ERROR;
315 bio->bio_resid = bio->bio_bcount;
316 biodone(bio);
317 }
318 }
319
/* Character device switch for namespace devices (<ctrlr>n<nsid>). */
static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};
329
/*
 * Create the host-side state and cdev for namespace 'id' using its
 * identify data.  Returns NULL if the namespace uses features this
 * driver does not support (protection information, metadata, or an
 * invalid LBA format) or if device creation fails.
 */
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    const struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/* One dummy bio avoids dropping to 0 until destroy. */
	refcount_init(&ns->active_bios, 1);

	/* End-to-end protection information is not supported. */
	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	/* The active format index must be within the advertised range. */
	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	/* Formats with per-LBA metadata are not supported. */
	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;
	/* Alias "<ctrlr>ns<nsid>"; NULL alias is tolerated in destroy. */
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}
405
406 void
nvmf_disconnect_ns(struct nvmf_namespace * ns)407 nvmf_disconnect_ns(struct nvmf_namespace *ns)
408 {
409 mtx_lock(&ns->lock);
410 ns->disconnected = true;
411 mtx_unlock(&ns->lock);
412 }
413
414 void
nvmf_reconnect_ns(struct nvmf_namespace * ns)415 nvmf_reconnect_ns(struct nvmf_namespace *ns)
416 {
417 TAILQ_HEAD(, bio) bios;
418 struct bio *bio;
419
420 mtx_lock(&ns->lock);
421 ns->disconnected = false;
422 TAILQ_INIT(&bios);
423 TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
424 mtx_unlock(&ns->lock);
425
426 while (!TAILQ_EMPTY(&bios)) {
427 bio = TAILQ_FIRST(&bios);
428 TAILQ_REMOVE(&bios, bio, bio_queue);
429 nvmf_ns_strategy(bio);
430 }
431 }
432
433 void
nvmf_shutdown_ns(struct nvmf_namespace * ns)434 nvmf_shutdown_ns(struct nvmf_namespace *ns)
435 {
436 TAILQ_HEAD(, bio) bios;
437 struct bio *bio;
438
439 mtx_lock(&ns->lock);
440 ns->shutdown = true;
441 TAILQ_INIT(&bios);
442 TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
443 mtx_unlock(&ns->lock);
444
445 while (!TAILQ_EMPTY(&bios)) {
446 bio = TAILQ_FIRST(&bios);
447 TAILQ_REMOVE(&bios, bio, bio_queue);
448 bio->bio_error = ECONNABORTED;
449 bio->bio_flags |= BIO_ERROR;
450 bio->bio_resid = bio->bio_bcount;
451 biodone(bio);
452 }
453 }
454
/*
 * Tear down a namespace: destroy its cdev(s), wait for in-flight I/O
 * to drain, fail any still-pending bios, and free the state.
 */
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	/* Destroy the alias first (may be NULL if make_dev_alias failed). */
	if (ns->cdev->si_drv2 != NULL)
		destroy_dev(ns->cdev->si_drv2);
	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain. The release drops
	 * the reference on the "dummy bio" when the namespace is
	 * created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		/* Sleep until nvmf_ns_biodone() drops the last ref. */
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}
493
494 bool
nvmf_update_ns(struct nvmf_namespace * ns,const struct nvme_namespace_data * data)495 nvmf_update_ns(struct nvmf_namespace *ns,
496 const struct nvme_namespace_data *data)
497 {
498 uint8_t lbads, lbaf;
499
500 if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
501 ns_printf(ns, "End-to-end data protection not supported\n");
502 return (false);
503 }
504
505 lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
506 if (lbaf > data->nlbaf) {
507 ns_printf(ns, "Invalid LBA format index\n");
508 return (false);
509 }
510
511 if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
512 ns_printf(ns, "Namespaces with metadata are not supported\n");
513 return (false);
514 }
515
516 lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
517 if (lbads == 0) {
518 ns_printf(ns, "Invalid LBA format index\n");
519 return (false);
520 }
521
522 ns->lba_size = 1 << lbads;
523 ns->size = data->nsze * ns->lba_size;
524 return (true);
525 }
526