/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>

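/*
 * Host-side state for a single namespace on a remote controller.
 * While the association is disconnected, new and aborted bios sit on
 * pending_bios until reconnect.  active_bios counts in-flight
 * requests so that destroy can wait for them to drain.
 */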
struct nvmf_namespace {
	struct nvmf_softc *sc;
	uint64_t size;
	uint32_t id;
	u_int	flags;
	uint32_t lba_size;
	bool disconnected;

	TAILQ_HEAD(, bio) pending_bios;
	struct mtx lock;
	volatile u_int active_bios;

	struct cdev *cdev;
};

static void	nvmf_ns_strategy(struct bio *bio);

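/* Log a printf(9)-style message prefixed with "<device>ns<id>: ". */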
static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
	    ns->id);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

/*
 * The I/O completion may trigger after the received CQE if the I/O
 * used a zero-copy mbuf that isn't harvested until after the NIC
 * driver processes TX completions.  Abuse bio_driver1 as a refcount.
 * Store I/O errors in bio_driver2.
 */
static __inline u_int *
bio_refs(struct bio *bio)
{
	return ((u_int *)&bio->bio_driver1);
}

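/*
 * Release one reference on a bio.  On the final release, an aborted
 * bio is resubmitted (or queued if the association is disconnected);
 * otherwise the bio is completed via biodone().
 */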
static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED) {
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			mtx_unlock(&ns->lock);
		} else {
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	if (refcount_release(&ns->active_bios))
		wakeup(ns);
}

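/*
 * The data transfer for a read/write bio has finished; record the
 * error and residual before dropping the bio reference.
 */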
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	KASSERT(xfered <= bio->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bio->bio_driver2 = (void *)(intptr_t)error;
	bio->bio_resid = bio->bio_bcount - xfered;

	nvmf_ns_biodone(bio);
}

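/*
 * The DSM range for a BIO_DELETE has been transferred; free it and
 * record any transfer error.
 */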
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bio = arg;

	if (error != 0)
		bio->bio_resid = bio->bio_bcount;
	else
		bio->bio_resid = 0;

	free(bio->bio_driver2, M_NVMF);
	bio->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bio);
}

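/* The command capsule for a bio completed; translate the CQE status. */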
static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bio = arg;

	if (nvmf_cqe_aborted(cqe))
		bio->bio_error = ECONNABORTED;
	else if (cqe->status != 0)
		bio->bio_error = EIO;

	nvmf_ns_biodone(bio);
}

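/*
 * Build an NVMe command for a bio and submit it as a request capsule.
 * While disconnected, bios are instead queued for resubmission on
 * reconnect.  Returns an error if the bio cannot be submitted.
 */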
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;

	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);		/* 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (0);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		bio->bio_driver2 = dsm_range;
		break;
	default:
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}

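/* Pass-through NVMe commands plus the basic disk and nsid ioctls. */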
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns = dev->si_drv1;
	struct nvme_get_nsid *gnsid;
	struct nvme_pt_command *pt;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		pt->cmd.nsid = htole32(ns->id);
		return (nvmf_passthrough_cmd(ns->sc, pt, false));
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
		gnsid->nsid = ns->id;
		return (0);
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		return (0);
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		return (0);
	default:
		return (ENOTTY);
	}
}

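/* Opening for writing requires securelevel <= 0. */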
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	int error;

	error = 0;
	if ((oflags & FWRITE) != 0)
		error = securelevel_gt(td->td_ucred, 0);
	return (error);
}

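/* d_strategy entry point: complete the bio with an error if it can't be submitted. */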
static void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	ns = bio->bio_dev->si_drv1;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error != 0) {
		bio->bio_error = error;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}
}

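/* Namespace devices are disk-like (D_DISK) character devices. */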
static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};

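/*
 * Create the host-side state for a namespace and expose it as a
 * character device, failing if the identify data describes an
 * unsupported format or feature.
 */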
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/* A dummy bio keeps active_bios from dropping to 0 until destroy. */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}

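/*
 * Mark the namespace disconnected.  In-flight requests will complete
 * with ECONNABORTED and be requeued by nvmf_ns_biodone(); new bios
 * are queued by nvmf_ns_submit_bio() until reconnect.
 */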
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}

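/* Resubmit any bios queued while the association was disconnected. */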
void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->disconnected = false;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		nvmf_ns_strategy(bio);
	}
}

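/*
 * Destroy the cdev, wait for in-flight requests to drain, and fail
 * any bios still waiting for resubmission.
 */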
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain.  The release drops
	 * the reference on the "dummy bio" acquired when the
	 * namespace was created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}

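/*
 * Re-validate identify data for an existing namespace (e.g. after a
 * namespace change) and update its LBA and total size.  Returns
 * false if the namespace now uses an unsupported format or feature.
 */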
bool
nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}