xref: /freebsd/sys/dev/nvd/nvd.c (revision 13ea0450a9c8742119d36f3bf8f47accdce46e54)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2012-2016 Intel Corporation
5  * All rights reserved.
6  * Copyright (C) 2018 Alexander Motin <mav@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/bio.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/queue.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <sys/taskqueue.h>
42 #include <machine/atomic.h>
43 
44 #include <geom/geom.h>
45 #include <geom/geom_disk.h>
46 
47 #include <dev/nvme/nvme.h>
48 
49 #define NVD_STR		"nvd"
50 
51 struct nvd_disk;
52 struct nvd_controller;
53 
54 static disk_ioctl_t nvd_ioctl;
55 static disk_strategy_t nvd_strategy;
56 static dumper_t nvd_dump;
57 
58 static void nvd_done(void *arg, const struct nvme_completion *cpl);
59 static void nvd_gone(struct nvd_disk *ndisk);
60 
61 static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
62 
63 static void *nvd_new_controller(struct nvme_controller *ctrlr);
64 static void nvd_controller_fail(void *ctrlr);
65 
66 static int nvd_load(void);
67 static void nvd_unload(void);
68 
69 MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");
70 
71 struct nvme_consumer *consumer_handle;
72 
73 struct nvd_disk {
74 	struct nvd_controller	*ctrlr;
75 
76 	struct bio_queue_head	bioq;
77 	struct task		bioqtask;
78 	struct mtx		bioqlock;
79 
80 	struct disk		*disk;
81 	struct taskqueue	*tq;
82 	struct nvme_namespace	*ns;
83 
84 	uint32_t		cur_depth;
85 #define	NVD_ODEPTH	(1 << 30)
86 	uint32_t		ordered_in_flight;
87 	u_int			unit;
88 
89 	TAILQ_ENTRY(nvd_disk)	global_tailq;
90 	TAILQ_ENTRY(nvd_disk)	ctrlr_tailq;
91 };
92 
93 struct nvd_controller {
94 
95 	TAILQ_ENTRY(nvd_controller)	tailq;
96 	TAILQ_HEAD(, nvd_disk)		disk_head;
97 };
98 
99 static struct mtx			nvd_lock;
100 static TAILQ_HEAD(, nvd_controller)	ctrlr_head;
101 static TAILQ_HEAD(disk_list, nvd_disk)	disk_head;
102 
103 static SYSCTL_NODE(_hw, OID_AUTO, nvd, CTLFLAG_RD, 0, "nvd driver parameters");
104 /*
105  * The NVMe specification does not define a maximum or optimal delete size, so
106  *  technically max delete size is min(full size of the namespace, 2^32 - 1
107  *  LBAs).  A single delete for a multi-TB NVMe namespace though may take much
108  *  longer to complete than the nvme(4) I/O timeout period.  So choose a sensible
109  *  default here that is still suitably large to minimize the number of overall
110  *  delete operations.
111  */
112 static uint64_t nvd_delete_max = (1024 * 1024 * 1024);  /* 1GB */
113 SYSCTL_UQUAD(_hw_nvd, OID_AUTO, delete_max, CTLFLAG_RDTUN, &nvd_delete_max, 0,
114 	     "nvd maximum BIO_DELETE size in bytes");
115 
116 static int nvd_modevent(module_t mod, int type, void *arg)
117 {
118 	int error = 0;
119 
120 	switch (type) {
121 	case MOD_LOAD:
122 		error = nvd_load();
123 		break;
124 	case MOD_UNLOAD:
125 		nvd_unload();
126 		break;
127 	default:
128 		break;
129 	}
130 
131 	return (error);
132 }
133 
134 moduledata_t nvd_mod = {
135 	NVD_STR,
136 	(modeventhand_t)nvd_modevent,
137 	0
138 };
139 
140 DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
141 MODULE_VERSION(nvd, 1);
142 MODULE_DEPEND(nvd, nvme, 1, 1, 1);
143 
144 static int
145 nvd_load()
146 {
147 	if (!nvme_use_nvd)
148 		return 0;
149 
150 	mtx_init(&nvd_lock, "nvd_lock", NULL, MTX_DEF);
151 	TAILQ_INIT(&ctrlr_head);
152 	TAILQ_INIT(&disk_head);
153 
154 	consumer_handle = nvme_register_consumer(nvd_new_disk,
155 	    nvd_new_controller, NULL, nvd_controller_fail);
156 
157 	return (consumer_handle != NULL ? 0 : -1);
158 }
159 
160 static void
161 nvd_unload()
162 {
163 	struct nvd_controller	*ctrlr;
164 	struct nvd_disk		*ndisk;
165 
166 	if (!nvme_use_nvd)
167 		return;
168 
169 	mtx_lock(&nvd_lock);
170 	while ((ctrlr = TAILQ_FIRST(&ctrlr_head)) != NULL) {
171 		TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
172 		TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
173 			nvd_gone(ndisk);
174 		while (!TAILQ_EMPTY(&ctrlr->disk_head))
175 			msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_unload",0);
176 		free(ctrlr, M_NVD);
177 	}
178 	mtx_unlock(&nvd_lock);
179 
180 	nvme_unregister_consumer(consumer_handle);
181 
182 	mtx_destroy(&nvd_lock);
183 }
184 
185 static void
186 nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp)
187 {
188 	int err;
189 
190 	bp->bio_driver1 = NULL;
191 	if (__predict_false(bp->bio_flags & BIO_ORDERED))
192 		atomic_add_int(&ndisk->cur_depth, NVD_ODEPTH);
193 	else
194 		atomic_add_int(&ndisk->cur_depth, 1);
195 	err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
196 	if (err) {
197 		if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
198 			atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
199 			atomic_add_int(&ndisk->ordered_in_flight, -1);
200 			wakeup(&ndisk->cur_depth);
201 		} else {
202 			if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
203 			    __predict_false(ndisk->ordered_in_flight != 0))
204 				wakeup(&ndisk->cur_depth);
205 		}
206 		bp->bio_error = err;
207 		bp->bio_flags |= BIO_ERROR;
208 		bp->bio_resid = bp->bio_bcount;
209 		biodone(bp);
210 	}
211 }
212 
213 static void
214 nvd_strategy(struct bio *bp)
215 {
216 	struct nvd_disk *ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;
217 
218 	/*
219 	 * bio with BIO_ORDERED flag must be executed after all previous
220 	 * bios in the queue, and before any successive bios.
221 	 */
222 	if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
223 		if (atomic_fetchadd_int(&ndisk->ordered_in_flight, 1) == 0 &&
224 		    ndisk->cur_depth == 0 && bioq_first(&ndisk->bioq) == NULL) {
225 			nvd_bio_submit(ndisk, bp);
226 			return;
227 		}
228 	} else if (__predict_true(ndisk->ordered_in_flight == 0)) {
229 		nvd_bio_submit(ndisk, bp);
230 		return;
231 	}
232 
233 	/*
234 	 * There are ordered bios in flight, so we need to submit
235 	 *  bios through the task queue to enforce ordering.
236 	 */
237 	mtx_lock(&ndisk->bioqlock);
238 	bioq_insert_tail(&ndisk->bioq, bp);
239 	mtx_unlock(&ndisk->bioqlock);
240 	taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
241 }
242 
243 static void
244 nvd_gone(struct nvd_disk *ndisk)
245 {
246 	struct bio	*bp;
247 
248 	printf(NVD_STR"%u: detached\n", ndisk->unit);
249 	mtx_lock(&ndisk->bioqlock);
250 	disk_gone(ndisk->disk);
251 	while ((bp = bioq_takefirst(&ndisk->bioq)) != NULL) {
252 		if (__predict_false(bp->bio_flags & BIO_ORDERED))
253 			atomic_add_int(&ndisk->ordered_in_flight, -1);
254 		bp->bio_error = ENXIO;
255 		bp->bio_flags |= BIO_ERROR;
256 		bp->bio_resid = bp->bio_bcount;
257 		biodone(bp);
258 	}
259 	mtx_unlock(&ndisk->bioqlock);
260 }
261 
262 static void
263 nvd_gonecb(struct disk *dp)
264 {
265 	struct nvd_disk *ndisk = (struct nvd_disk *)dp->d_drv1;
266 
267 	disk_destroy(ndisk->disk);
268 	mtx_lock(&nvd_lock);
269 	TAILQ_REMOVE(&disk_head, ndisk, global_tailq);
270 	TAILQ_REMOVE(&ndisk->ctrlr->disk_head, ndisk, ctrlr_tailq);
271 	if (TAILQ_EMPTY(&ndisk->ctrlr->disk_head))
272 		wakeup(&ndisk->ctrlr->disk_head);
273 	mtx_unlock(&nvd_lock);
274 	taskqueue_free(ndisk->tq);
275 	mtx_destroy(&ndisk->bioqlock);
276 	free(ndisk, M_NVD);
277 }
278 
279 static int
280 nvd_ioctl(struct disk *ndisk, u_long cmd, void *data, int fflag,
281     struct thread *td)
282 {
283 	int ret = 0;
284 
285 	switch (cmd) {
286 	default:
287 		ret = EIO;
288 	}
289 
290 	return (ret);
291 }
292 
293 static int
294 nvd_dump(void *arg, void *virt, vm_offset_t phys, off_t offset, size_t len)
295 {
296 	struct disk *dp = arg;
297 	struct nvd_disk *ndisk = dp->d_drv1;
298 
299 	return (nvme_ns_dump(ndisk->ns, virt, offset, len));
300 }
301 
302 static void
303 nvd_done(void *arg, const struct nvme_completion *cpl)
304 {
305 	struct bio *bp = (struct bio *)arg;
306 	struct nvd_disk *ndisk = bp->bio_disk->d_drv1;
307 
308 	if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
309 		atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
310 		atomic_add_int(&ndisk->ordered_in_flight, -1);
311 		wakeup(&ndisk->cur_depth);
312 	} else {
313 		if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
314 		    __predict_false(ndisk->ordered_in_flight != 0))
315 			wakeup(&ndisk->cur_depth);
316 	}
317 
318 	biodone(bp);
319 }
320 
321 static void
322 nvd_bioq_process(void *arg, int pending)
323 {
324 	struct nvd_disk *ndisk = arg;
325 	struct bio *bp;
326 
327 	for (;;) {
328 		mtx_lock(&ndisk->bioqlock);
329 		bp = bioq_takefirst(&ndisk->bioq);
330 		mtx_unlock(&ndisk->bioqlock);
331 		if (bp == NULL)
332 			break;
333 
334 		if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
335 			/*
336 			 * bio with BIO_ORDERED flag set must be executed
337 			 * after all previous bios.
338 			 */
339 			while (ndisk->cur_depth > 0)
340 				tsleep(&ndisk->cur_depth, 0, "nvdorb", 1);
341 		} else {
342 			/*
343 			 * bio with BIO_ORDERED flag set must be completed
344 			 * before proceeding with additional bios.
345 			 */
346 			while (ndisk->cur_depth >= NVD_ODEPTH)
347 				tsleep(&ndisk->cur_depth, 0, "nvdora", 1);
348 		}
349 
350 		nvd_bio_submit(ndisk, bp);
351 	}
352 }
353 
354 static void *
355 nvd_new_controller(struct nvme_controller *ctrlr)
356 {
357 	struct nvd_controller	*nvd_ctrlr;
358 
359 	nvd_ctrlr = malloc(sizeof(struct nvd_controller), M_NVD,
360 	    M_ZERO | M_WAITOK);
361 
362 	TAILQ_INIT(&nvd_ctrlr->disk_head);
363 	mtx_lock(&nvd_lock);
364 	TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
365 	mtx_unlock(&nvd_lock);
366 
367 	return (nvd_ctrlr);
368 }
369 
370 static void *
371 nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
372 {
373 	uint8_t			descr[NVME_MODEL_NUMBER_LENGTH+1];
374 	struct nvd_disk		*ndisk, *tnd;
375 	struct disk		*disk;
376 	struct nvd_controller	*ctrlr = ctrlr_arg;
377 	int unit;
378 
379 	ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);
380 	ndisk->ctrlr = ctrlr;
381 	ndisk->ns = ns;
382 	ndisk->cur_depth = 0;
383 	ndisk->ordered_in_flight = 0;
384 	mtx_init(&ndisk->bioqlock, "nvd bioq lock", NULL, MTX_DEF);
385 	bioq_init(&ndisk->bioq);
386 	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
387 
388 	mtx_lock(&nvd_lock);
389 	unit = 0;
390 	TAILQ_FOREACH(tnd, &disk_head, global_tailq) {
391 		if (tnd->unit > unit)
392 			break;
393 		unit = tnd->unit + 1;
394 	}
395 	ndisk->unit = unit;
396 	if (tnd != NULL)
397 		TAILQ_INSERT_BEFORE(tnd, ndisk, global_tailq);
398 	else
399 		TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
400 	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
401 	mtx_unlock(&nvd_lock);
402 
403 	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
404 	    taskqueue_thread_enqueue, &ndisk->tq);
405 	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");
406 
407 	disk = ndisk->disk = disk_alloc();
408 	disk->d_strategy = nvd_strategy;
409 	disk->d_ioctl = nvd_ioctl;
410 	disk->d_dump = nvd_dump;
411 	disk->d_gone = nvd_gonecb;
412 	disk->d_name = NVD_STR;
413 	disk->d_unit = ndisk->unit;
414 	disk->d_drv1 = ndisk;
415 
416 	disk->d_sectorsize = nvme_ns_get_sector_size(ns);
417 	disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
418 	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
419 	disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
420 	if (disk->d_delmaxsize > nvd_delete_max)
421 		disk->d_delmaxsize = nvd_delete_max;
422 	disk->d_stripesize = nvme_ns_get_stripesize(ns);
423 	disk->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
424 	if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
425 		disk->d_flags |= DISKFLAG_CANDELETE;
426 	if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
427 		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
428 
429 	/*
430 	 * d_ident and d_descr are both far bigger than the length of either
431 	 *  the serial or model number strings.
432 	 */
433 	nvme_strvis(disk->d_ident, nvme_ns_get_serial_number(ns),
434 	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
435 	nvme_strvis(descr, nvme_ns_get_model_number(ns), sizeof(descr),
436 	    NVME_MODEL_NUMBER_LENGTH);
437 	strlcpy(disk->d_descr, descr, sizeof(descr));
438 
439 	disk->d_rotation_rate = DISK_RR_NON_ROTATING;
440 
441 	disk_create(disk, DISK_VERSION);
442 
443 	printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
444 	printf(NVD_STR"%u: %juMB (%ju %u byte sectors)\n", disk->d_unit,
445 		(uintmax_t)disk->d_mediasize / (1024*1024),
446 		(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
447 		disk->d_sectorsize);
448 
449 	return (ndisk);
450 }
451 
452 static void
453 nvd_controller_fail(void *ctrlr_arg)
454 {
455 	struct nvd_controller	*ctrlr = ctrlr_arg;
456 	struct nvd_disk		*ndisk;
457 
458 	mtx_lock(&nvd_lock);
459 	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
460 	TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
461 		nvd_gone(ndisk);
462 	while (!TAILQ_EMPTY(&ctrlr->disk_head))
463 		msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_fail", 0);
464 	mtx_unlock(&nvd_lock);
465 	free(ctrlr, M_NVD);
466 }
467 
468