xref: /freebsd/sys/dev/nvmf/host/nvmf_ns.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2023-2024 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  */
7 
8 #include <sys/param.h>
9 #include <sys/bio.h>
10 #include <sys/bus.h>
11 #include <sys/conf.h>
12 #include <sys/disk.h>
13 #include <sys/fcntl.h>
14 #include <sys/lock.h>
15 #include <sys/malloc.h>
16 #include <sys/memdesc.h>
17 #include <sys/mutex.h>
18 #include <sys/proc.h>
19 #include <sys/refcount.h>
20 #include <sys/sbuf.h>
21 #include <machine/stdarg.h>
22 #include <dev/nvme/nvme.h>
23 #include <dev/nvmf/host/nvmf_var.h>
24 
25 struct nvmf_namespace {
26 	struct nvmf_softc *sc;
27 	uint64_t size;
28 	uint32_t id;
29 	u_int	flags;
30 	uint32_t lba_size;
31 	bool disconnected;
32 	bool shutdown;
33 
34 	TAILQ_HEAD(, bio) pending_bios;
35 	struct mtx lock;
36 	volatile u_int active_bios;
37 
38 	struct cdev *cdev;
39 };
40 
/* Forward declaration: nvmf_ns_biodone() resubmits aborted bios through it. */
static void nvmf_ns_strategy(struct bio *bio);
42 
43 static void
44 ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
45 {
46 	char buf[128];
47 	struct sbuf sb;
48 	va_list ap;
49 
50 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
51 	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
52 
53 	sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
54 	    ns->id);
55 
56 	va_start(ap, fmt);
57 	sbuf_vprintf(&sb, fmt, ap);
58 	va_end(ap);
59 
60 	sbuf_finish(&sb);
61 	sbuf_delete(&sb);
62 }
63 
64 /*
65  * The I/O completion may trigger after the received CQE if the I/O
66  * used a zero-copy mbuf that isn't harvested until after the NIC
67  * driver processes TX completions.  Abuse bio_driver1 as a refcount.
68  * Store I/O errors in bio_driver2.
69  */
70 static __inline u_int *
71 bio_refs(struct bio *bio)
72 {
73 	return ((u_int *)&bio->bio_driver1);
74 }
75 
76 static void
77 nvmf_ns_biodone(struct bio *bio)
78 {
79 	struct nvmf_namespace *ns;
80 	int error;
81 
82 	if (!refcount_release(bio_refs(bio)))
83 		return;
84 
85 	ns = bio->bio_dev->si_drv1;
86 
87 	/* If a request is aborted, resubmit or queue it for resubmission. */
88 	if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
89 		bio->bio_error = 0;
90 		bio->bio_driver2 = 0;
91 		mtx_lock(&ns->lock);
92 		if (ns->disconnected) {
93 			if (nvmf_fail_disconnect || ns->shutdown) {
94 				mtx_unlock(&ns->lock);
95 				bio->bio_error = ECONNABORTED;
96 				bio->bio_flags |= BIO_ERROR;
97 				bio->bio_resid = bio->bio_bcount;
98 				biodone(bio);
99 			} else {
100 				TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
101 				    bio_queue);
102 				mtx_unlock(&ns->lock);
103 			}
104 		} else {
105 			mtx_unlock(&ns->lock);
106 			nvmf_ns_strategy(bio);
107 		}
108 	} else {
109 		/*
110 		 * I/O errors take precedence over generic EIO from
111 		 * CQE errors.
112 		 */
113 		error = (intptr_t)bio->bio_driver2;
114 		if (error != 0)
115 			bio->bio_error = error;
116 		if (bio->bio_error != 0)
117 			bio->bio_flags |= BIO_ERROR;
118 		biodone(bio);
119 	}
120 
121 	if (refcount_release(&ns->active_bios))
122 		wakeup(ns);
123 }
124 
125 static void
126 nvmf_ns_io_complete(void *arg, size_t xfered, int error)
127 {
128 	struct bio *bio = arg;
129 
130 	KASSERT(xfered <= bio->bio_bcount,
131 	    ("%s: xfered > bio_bcount", __func__));
132 
133 	bio->bio_driver2 = (void *)(intptr_t)error;
134 	bio->bio_resid = bio->bio_bcount - xfered;
135 
136 	nvmf_ns_biodone(bio);
137 }
138 
139 static void
140 nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
141 {
142 	struct bio *bio = arg;
143 
144 	if (error != 0)
145 		bio->bio_resid = bio->bio_bcount;
146 	else
147 		bio->bio_resid = 0;
148 
149 	free(bio->bio_driver2, M_NVMF);
150 	bio->bio_driver2 = (void *)(intptr_t)error;
151 
152 	nvmf_ns_biodone(bio);
153 }
154 
155 static void
156 nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
157 {
158 	struct bio *bio = arg;
159 
160 	if (nvmf_cqe_aborted(cqe))
161 		bio->bio_error = ECONNABORTED;
162 	else if (cqe->status != 0)
163 		bio->bio_error = EIO;
164 
165 	nvmf_ns_biodone(bio);
166 }
167 
168 static int
169 nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
170 {
171 	struct nvme_command cmd;
172 	struct nvmf_request *req;
173 	struct nvme_dsm_range *dsm_range;
174 	struct memdesc mem;
175 	uint64_t lba, lba_count;
176 	int error;
177 
178 	dsm_range = NULL;
179 	memset(&cmd, 0, sizeof(cmd));
180 	switch (bio->bio_cmd) {
181 	case BIO_READ:
182 		lba = bio->bio_offset / ns->lba_size;
183 		lba_count = bio->bio_bcount / ns->lba_size;
184 		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
185 		break;
186 	case BIO_WRITE:
187 		lba = bio->bio_offset / ns->lba_size;
188 		lba_count = bio->bio_bcount / ns->lba_size;
189 		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
190 		break;
191 	case BIO_FLUSH:
192 		nvme_ns_flush_cmd(&cmd, ns->id);
193 		break;
194 	case BIO_DELETE:
195 		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
196 		    M_ZERO);
197 		if (dsm_range == NULL)
198 			return (ENOMEM);
199 		lba = bio->bio_offset / ns->lba_size;
200 		lba_count = bio->bio_bcount / ns->lba_size;
201 		dsm_range->starting_lba = htole64(lba);
202 		dsm_range->length = htole32(lba_count);
203 
204 		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
205 		cmd.nsid = htole32(ns->id);
206 		cmd.cdw10 = htole32(0);		/* 1 range */
207 		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
208 		break;
209 	default:
210 		return (EOPNOTSUPP);
211 	}
212 
213 	mtx_lock(&ns->lock);
214 	if (ns->disconnected) {
215 		if (nvmf_fail_disconnect || ns->shutdown) {
216 			error = ECONNABORTED;
217 		} else {
218 			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
219 			error = 0;
220 		}
221 		mtx_unlock(&ns->lock);
222 		free(dsm_range, M_NVMF);
223 		return (error);
224 	}
225 
226 	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
227 	    nvmf_ns_bio_complete, bio, M_NOWAIT);
228 	if (req == NULL) {
229 		mtx_unlock(&ns->lock);
230 		free(dsm_range, M_NVMF);
231 		return (ENOMEM);
232 	}
233 
234 	switch (bio->bio_cmd) {
235 	case BIO_READ:
236 	case BIO_WRITE:
237 		refcount_init(bio_refs(bio), 2);
238 		mem = memdesc_bio(bio);
239 		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
240 		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
241 		break;
242 	case BIO_DELETE:
243 		refcount_init(bio_refs(bio), 2);
244 		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
245 		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
246 		    true, nvmf_ns_delete_complete, bio);
247 		bio->bio_driver2 = dsm_range;
248 		break;
249 	default:
250 		refcount_init(bio_refs(bio), 1);
251 		KASSERT(bio->bio_resid == 0,
252 		    ("%s: input bio_resid != 0", __func__));
253 		break;
254 	}
255 
256 	refcount_acquire(&ns->active_bios);
257 	nvmf_submit_request(req);
258 	mtx_unlock(&ns->lock);
259 	return (0);
260 }
261 
262 static int
263 nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
264     struct thread *td)
265 {
266 	struct nvmf_namespace *ns = dev->si_drv1;
267 	struct nvme_get_nsid *gnsid;
268 	struct nvme_pt_command *pt;
269 
270 	switch (cmd) {
271 	case NVME_PASSTHROUGH_CMD:
272 		pt = (struct nvme_pt_command *)arg;
273 		pt->cmd.nsid = htole32(ns->id);
274 		return (nvmf_passthrough_cmd(ns->sc, pt, false));
275 	case NVME_GET_NSID:
276 		gnsid = (struct nvme_get_nsid *)arg;
277 		strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
278 		    sizeof(gnsid->cdev));
279 		gnsid->nsid = ns->id;
280 		return (0);
281 	case DIOCGMEDIASIZE:
282 		*(off_t *)arg = ns->size;
283 		return (0);
284 	case DIOCGSECTORSIZE:
285 		*(u_int *)arg = ns->lba_size;
286 		return (0);
287 	default:
288 		return (ENOTTY);
289 	}
290 }
291 
292 static int
293 nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
294 {
295 	int error;
296 
297 	error = 0;
298 	if ((oflags & FWRITE) != 0)
299 		error = securelevel_gt(td->td_ucred, 0);
300 	return (error);
301 }
302 
303 void
304 nvmf_ns_strategy(struct bio *bio)
305 {
306 	struct nvmf_namespace *ns;
307 	int error;
308 
309 	ns = bio->bio_dev->si_drv1;
310 
311 	error = nvmf_ns_submit_bio(ns, bio);
312 	if (error != 0) {
313 		bio->bio_error = error;
314 		bio->bio_flags |= BIO_ERROR;
315 		bio->bio_resid = bio->bio_bcount;
316 		biodone(bio);
317 	}
318 }
319 
320 static struct cdevsw nvmf_ns_cdevsw = {
321 	.d_version = D_VERSION,
322 	.d_flags = D_DISK,
323 	.d_open = nvmf_ns_open,
324 	.d_read = physread,
325 	.d_write = physwrite,
326 	.d_strategy = nvmf_ns_strategy,
327 	.d_ioctl = nvmf_ns_ioctl
328 };
329 
330 struct nvmf_namespace *
331 nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
332     const struct nvme_namespace_data *data)
333 {
334 	struct make_dev_args mda;
335 	struct nvmf_namespace *ns;
336 	int error;
337 	uint8_t lbads, lbaf;
338 
339 	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
340 	ns->sc = sc;
341 	ns->id = id;
342 	TAILQ_INIT(&ns->pending_bios);
343 	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);
344 
345 	/* One dummy bio avoids dropping to 0 until destroy. */
346 	refcount_init(&ns->active_bios, 1);
347 
348 	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
349 		ns_printf(ns, "End-to-end data protection not supported\n");
350 		goto fail;
351 	}
352 
353 	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
354 	if (lbaf > data->nlbaf) {
355 		ns_printf(ns, "Invalid LBA format index\n");
356 		goto fail;
357 	}
358 
359 	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
360 		ns_printf(ns, "Namespaces with metadata are not supported\n");
361 		goto fail;
362 	}
363 
364 	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
365 	if (lbads == 0) {
366 		ns_printf(ns, "Invalid LBA format index\n");
367 		goto fail;
368 	}
369 
370 	ns->lba_size = 1 << lbads;
371 	ns->size = data->nsze * ns->lba_size;
372 
373 	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
374 		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
375 
376 	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
377 		ns->flags |= NVME_NS_FLUSH_SUPPORTED;
378 
379 	/*
380 	 * XXX: Does any of the boundary splitting for NOIOB make any
381 	 * sense for Fabrics?
382 	 */
383 
384 	make_dev_args_init(&mda);
385 	mda.mda_devsw = &nvmf_ns_cdevsw;
386 	mda.mda_uid = UID_ROOT;
387 	mda.mda_gid = GID_WHEEL;
388 	mda.mda_mode = 0600;
389 	mda.mda_si_drv1 = ns;
390 	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
391 	    device_get_nameunit(sc->dev), id);
392 	if (error != 0)
393 		goto fail;
394 	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
395 	    device_get_nameunit(sc->dev), id);
396 
397 	ns->cdev->si_flags |= SI_UNMAPPED;
398 
399 	return (ns);
400 fail:
401 	mtx_destroy(&ns->lock);
402 	free(ns, M_NVMF);
403 	return (NULL);
404 }
405 
406 void
407 nvmf_disconnect_ns(struct nvmf_namespace *ns)
408 {
409 	mtx_lock(&ns->lock);
410 	ns->disconnected = true;
411 	mtx_unlock(&ns->lock);
412 }
413 
414 void
415 nvmf_reconnect_ns(struct nvmf_namespace *ns)
416 {
417 	TAILQ_HEAD(, bio) bios;
418 	struct bio *bio;
419 
420 	mtx_lock(&ns->lock);
421 	ns->disconnected = false;
422 	TAILQ_INIT(&bios);
423 	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
424 	mtx_unlock(&ns->lock);
425 
426 	while (!TAILQ_EMPTY(&bios)) {
427 		bio = TAILQ_FIRST(&bios);
428 		TAILQ_REMOVE(&bios, bio, bio_queue);
429 		nvmf_ns_strategy(bio);
430 	}
431 }
432 
433 void
434 nvmf_shutdown_ns(struct nvmf_namespace *ns)
435 {
436 	TAILQ_HEAD(, bio) bios;
437 	struct bio *bio;
438 
439 	mtx_lock(&ns->lock);
440 	ns->shutdown = true;
441 	TAILQ_INIT(&bios);
442 	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
443 	mtx_unlock(&ns->lock);
444 
445 	while (!TAILQ_EMPTY(&bios)) {
446 		bio = TAILQ_FIRST(&bios);
447 		TAILQ_REMOVE(&bios, bio, bio_queue);
448 		bio->bio_error = ECONNABORTED;
449 		bio->bio_flags |= BIO_ERROR;
450 		bio->bio_resid = bio->bio_bcount;
451 		biodone(bio);
452 	}
453 }
454 
455 void
456 nvmf_destroy_ns(struct nvmf_namespace *ns)
457 {
458 	TAILQ_HEAD(, bio) bios;
459 	struct bio *bio;
460 
461 	if (ns->cdev->si_drv2 != NULL)
462 		destroy_dev(ns->cdev->si_drv2);
463 	destroy_dev(ns->cdev);
464 
465 	/*
466 	 * Wait for active I/O requests to drain.  The release drops
467 	 * the reference on the "dummy bio" when the namespace is
468 	 * created.
469 	 */
470 	mtx_lock(&ns->lock);
471 	if (!refcount_release(&ns->active_bios)) {
472 		while (ns->active_bios != 0)
473 			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
474 	}
475 
476 	/* Abort any pending I/O requests. */
477 	TAILQ_INIT(&bios);
478 	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
479 	mtx_unlock(&ns->lock);
480 
481 	while (!TAILQ_EMPTY(&bios)) {
482 		bio = TAILQ_FIRST(&bios);
483 		TAILQ_REMOVE(&bios, bio, bio_queue);
484 		bio->bio_error = ECONNABORTED;
485 		bio->bio_flags |= BIO_ERROR;
486 		bio->bio_resid = bio->bio_bcount;
487 		biodone(bio);
488 	}
489 
490 	mtx_destroy(&ns->lock);
491 	free(ns, M_NVMF);
492 }
493 
494 bool
495 nvmf_update_ns(struct nvmf_namespace *ns,
496     const struct nvme_namespace_data *data)
497 {
498 	uint8_t lbads, lbaf;
499 
500 	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
501 		ns_printf(ns, "End-to-end data protection not supported\n");
502 		return (false);
503 	}
504 
505 	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
506 	if (lbaf > data->nlbaf) {
507 		ns_printf(ns, "Invalid LBA format index\n");
508 		return (false);
509 	}
510 
511 	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
512 		ns_printf(ns, "Namespaces with metadata are not supported\n");
513 		return (false);
514 	}
515 
516 	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
517 	if (lbads == 0) {
518 		ns_printf(ns, "Invalid LBA format index\n");
519 		return (false);
520 	}
521 
522 	ns->lba_size = 1 << lbads;
523 	ns->size = data->nsze * ns->lba_size;
524 	return (true);
525 }
526