xref: /freebsd/sys/dev/nvd/nvd.c (revision 28f4385e45a2681c14bd04b83fe1796eaefe8265)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2012-2016 Intel Corporation
5  * All rights reserved.
6  * Copyright (C) 2018 Alexander Motin <mav@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/bio.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/queue.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <sys/taskqueue.h>
42 #include <machine/atomic.h>
43 
44 #include <geom/geom.h>
45 #include <geom/geom_disk.h>
46 
47 #include <dev/nvme/nvme.h>
48 
49 #define NVD_STR		"nvd"
50 
51 struct nvd_disk;
52 struct nvd_controller;
53 
54 static disk_ioctl_t nvd_ioctl;
55 static disk_strategy_t nvd_strategy;
56 static dumper_t nvd_dump;
57 
58 static void nvd_done(void *arg, const struct nvme_completion *cpl);
59 static void nvd_gone(struct nvd_disk *ndisk);
60 
61 static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
62 
63 static void *nvd_new_controller(struct nvme_controller *ctrlr);
64 static void nvd_controller_fail(void *ctrlr);
65 
66 static int nvd_load(void);
67 static void nvd_unload(void);
68 
69 MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");
70 
71 struct nvme_consumer *consumer_handle;
72 
73 struct nvd_disk {
74 	struct nvd_controller	*ctrlr;
75 
76 	struct bio_queue_head	bioq;
77 	struct task		bioqtask;
78 	struct mtx		bioqlock;
79 
80 	struct disk		*disk;
81 	struct taskqueue	*tq;
82 	struct nvme_namespace	*ns;
83 
84 	uint32_t		cur_depth;
85 	uint32_t		ordered_in_flight;
86 	u_int			unit;
87 
88 	TAILQ_ENTRY(nvd_disk)	global_tailq;
89 	TAILQ_ENTRY(nvd_disk)	ctrlr_tailq;
90 };
91 
92 struct nvd_controller {
93 
94 	TAILQ_ENTRY(nvd_controller)	tailq;
95 	TAILQ_HEAD(, nvd_disk)		disk_head;
96 };
97 
98 static struct mtx			nvd_lock;
99 static TAILQ_HEAD(, nvd_controller)	ctrlr_head;
100 static TAILQ_HEAD(disk_list, nvd_disk)	disk_head;
101 
102 static SYSCTL_NODE(_hw, OID_AUTO, nvd, CTLFLAG_RD, 0, "nvd driver parameters");
103 /*
104  * The NVMe specification does not define a maximum or optimal delete size, so
105  *  technically max delete size is min(full size of the namespace, 2^32 - 1
106  *  LBAs).  A single delete for a multi-TB NVMe namespace though may take much
107  *  longer to complete than the nvme(4) I/O timeout period.  So choose a sensible
108  *  default here that is still suitably large to minimize the number of overall
109  *  delete operations.
110  */
111 static uint64_t nvd_delete_max = (1024 * 1024 * 1024);  /* 1GB */
112 SYSCTL_UQUAD(_hw_nvd, OID_AUTO, delete_max, CTLFLAG_RDTUN, &nvd_delete_max, 0,
113 	     "nvd maximum BIO_DELETE size in bytes");
114 
115 static int nvd_modevent(module_t mod, int type, void *arg)
116 {
117 	int error = 0;
118 
119 	switch (type) {
120 	case MOD_LOAD:
121 		error = nvd_load();
122 		break;
123 	case MOD_UNLOAD:
124 		nvd_unload();
125 		break;
126 	default:
127 		break;
128 	}
129 
130 	return (error);
131 }
132 
133 moduledata_t nvd_mod = {
134 	NVD_STR,
135 	(modeventhand_t)nvd_modevent,
136 	0
137 };
138 
139 DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
140 MODULE_VERSION(nvd, 1);
141 MODULE_DEPEND(nvd, nvme, 1, 1, 1);
142 
143 static int
144 nvd_load()
145 {
146 	if (!nvme_use_nvd)
147 		return 0;
148 
149 	mtx_init(&nvd_lock, "nvd_lock", NULL, MTX_DEF);
150 	TAILQ_INIT(&ctrlr_head);
151 	TAILQ_INIT(&disk_head);
152 
153 	consumer_handle = nvme_register_consumer(nvd_new_disk,
154 	    nvd_new_controller, NULL, nvd_controller_fail);
155 
156 	return (consumer_handle != NULL ? 0 : -1);
157 }
158 
159 static void
160 nvd_unload()
161 {
162 	struct nvd_controller	*ctrlr;
163 	struct nvd_disk		*ndisk;
164 
165 	if (!nvme_use_nvd)
166 		return;
167 
168 	mtx_lock(&nvd_lock);
169 	while ((ctrlr = TAILQ_FIRST(&ctrlr_head)) != NULL) {
170 		TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
171 		TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
172 			nvd_gone(ndisk);
173 		while (!TAILQ_EMPTY(&ctrlr->disk_head))
174 			msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_unload",0);
175 		free(ctrlr, M_NVD);
176 	}
177 	mtx_unlock(&nvd_lock);
178 
179 	nvme_unregister_consumer(consumer_handle);
180 
181 	mtx_destroy(&nvd_lock);
182 }
183 
184 static int
185 nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp)
186 {
187 	int err;
188 
189 	bp->bio_driver1 = NULL;
190 	atomic_add_int(&ndisk->cur_depth, 1);
191 	err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
192 	if (err) {
193 		atomic_add_int(&ndisk->cur_depth, -1);
194 		if (__predict_false(bp->bio_flags & BIO_ORDERED))
195 			atomic_add_int(&ndisk->ordered_in_flight, -1);
196 		bp->bio_error = err;
197 		bp->bio_flags |= BIO_ERROR;
198 		bp->bio_resid = bp->bio_bcount;
199 		biodone(bp);
200 		return (-1);
201 	}
202 
203 	return (0);
204 }
205 
206 static void
207 nvd_strategy(struct bio *bp)
208 {
209 	struct nvd_disk *ndisk;
210 
211 	ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;
212 
213 	if (__predict_false(bp->bio_flags & BIO_ORDERED))
214 		atomic_add_int(&ndisk->ordered_in_flight, 1);
215 
216 	if (__predict_true(ndisk->ordered_in_flight == 0)) {
217 		nvd_bio_submit(ndisk, bp);
218 		return;
219 	}
220 
221 	/*
222 	 * There are ordered bios in flight, so we need to submit
223 	 *  bios through the task queue to enforce ordering.
224 	 */
225 	mtx_lock(&ndisk->bioqlock);
226 	bioq_insert_tail(&ndisk->bioq, bp);
227 	mtx_unlock(&ndisk->bioqlock);
228 	taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
229 }
230 
231 static void
232 nvd_gone(struct nvd_disk *ndisk)
233 {
234 	struct bio	*bp;
235 
236 	printf(NVD_STR"%u: detached\n", ndisk->unit);
237 	mtx_lock(&ndisk->bioqlock);
238 	disk_gone(ndisk->disk);
239 	while ((bp = bioq_takefirst(&ndisk->bioq)) != NULL) {
240 		if (__predict_false(bp->bio_flags & BIO_ORDERED))
241 			atomic_add_int(&ndisk->ordered_in_flight, -1);
242 		bp->bio_error = ENXIO;
243 		bp->bio_flags |= BIO_ERROR;
244 		bp->bio_resid = bp->bio_bcount;
245 		biodone(bp);
246 	}
247 	mtx_unlock(&ndisk->bioqlock);
248 }
249 
250 static void
251 nvd_gonecb(struct disk *dp)
252 {
253 	struct nvd_disk *ndisk = (struct nvd_disk *)dp->d_drv1;
254 
255 	disk_destroy(ndisk->disk);
256 	mtx_lock(&nvd_lock);
257 	TAILQ_REMOVE(&disk_head, ndisk, global_tailq);
258 	TAILQ_REMOVE(&ndisk->ctrlr->disk_head, ndisk, ctrlr_tailq);
259 	if (TAILQ_EMPTY(&ndisk->ctrlr->disk_head))
260 		wakeup(&ndisk->ctrlr->disk_head);
261 	mtx_unlock(&nvd_lock);
262 	taskqueue_free(ndisk->tq);
263 	mtx_destroy(&ndisk->bioqlock);
264 	free(ndisk, M_NVD);
265 }
266 
267 static int
268 nvd_ioctl(struct disk *ndisk, u_long cmd, void *data, int fflag,
269     struct thread *td)
270 {
271 	int ret = 0;
272 
273 	switch (cmd) {
274 	default:
275 		ret = EIO;
276 	}
277 
278 	return (ret);
279 }
280 
281 static int
282 nvd_dump(void *arg, void *virt, vm_offset_t phys, off_t offset, size_t len)
283 {
284 	struct nvd_disk *ndisk;
285 	struct disk *dp;
286 
287 	dp = arg;
288 	ndisk = dp->d_drv1;
289 
290 	return (nvme_ns_dump(ndisk->ns, virt, offset, len));
291 }
292 
293 static void
294 nvd_done(void *arg, const struct nvme_completion *cpl)
295 {
296 	struct bio *bp;
297 	struct nvd_disk *ndisk;
298 
299 	bp = (struct bio *)arg;
300 
301 	ndisk = bp->bio_disk->d_drv1;
302 
303 	atomic_add_int(&ndisk->cur_depth, -1);
304 	if (__predict_false(bp->bio_flags & BIO_ORDERED))
305 		atomic_add_int(&ndisk->ordered_in_flight, -1);
306 
307 	biodone(bp);
308 }
309 
310 static void
311 nvd_bioq_process(void *arg, int pending)
312 {
313 	struct nvd_disk *ndisk = arg;
314 	struct bio *bp;
315 
316 	for (;;) {
317 		mtx_lock(&ndisk->bioqlock);
318 		bp = bioq_takefirst(&ndisk->bioq);
319 		mtx_unlock(&ndisk->bioqlock);
320 		if (bp == NULL)
321 			break;
322 
323 		if (nvd_bio_submit(ndisk, bp) != 0) {
324 			continue;
325 		}
326 
327 #ifdef BIO_ORDERED
328 		/*
329 		 * BIO_ORDERED flag dictates that the bio with BIO_ORDERED
330 		 *  flag set must be completed before proceeding with
331 		 *  additional bios.
332 		 */
333 		if (bp->bio_flags & BIO_ORDERED) {
334 			while (ndisk->cur_depth > 0) {
335 				pause("nvd flush", 1);
336 			}
337 		}
338 #endif
339 	}
340 }
341 
342 static void *
343 nvd_new_controller(struct nvme_controller *ctrlr)
344 {
345 	struct nvd_controller	*nvd_ctrlr;
346 
347 	nvd_ctrlr = malloc(sizeof(struct nvd_controller), M_NVD,
348 	    M_ZERO | M_WAITOK);
349 
350 	TAILQ_INIT(&nvd_ctrlr->disk_head);
351 	mtx_lock(&nvd_lock);
352 	TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
353 	mtx_unlock(&nvd_lock);
354 
355 	return (nvd_ctrlr);
356 }
357 
358 static void *
359 nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
360 {
361 	uint8_t			descr[NVME_MODEL_NUMBER_LENGTH+1];
362 	struct nvd_disk		*ndisk, *tnd;
363 	struct disk		*disk;
364 	struct nvd_controller	*ctrlr = ctrlr_arg;
365 	int unit;
366 
367 	ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);
368 	ndisk->ctrlr = ctrlr;
369 	ndisk->ns = ns;
370 	ndisk->cur_depth = 0;
371 	ndisk->ordered_in_flight = 0;
372 	mtx_init(&ndisk->bioqlock, "nvd bioq lock", NULL, MTX_DEF);
373 	bioq_init(&ndisk->bioq);
374 	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
375 
376 	mtx_lock(&nvd_lock);
377 	unit = 0;
378 	TAILQ_FOREACH(tnd, &disk_head, global_tailq) {
379 		if (tnd->unit > unit)
380 			break;
381 		unit = tnd->unit + 1;
382 	}
383 	ndisk->unit = unit;
384 	if (tnd != NULL)
385 		TAILQ_INSERT_BEFORE(tnd, ndisk, global_tailq);
386 	else
387 		TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
388 	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
389 	mtx_unlock(&nvd_lock);
390 
391 	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
392 	    taskqueue_thread_enqueue, &ndisk->tq);
393 	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");
394 
395 	disk = ndisk->disk = disk_alloc();
396 	disk->d_strategy = nvd_strategy;
397 	disk->d_ioctl = nvd_ioctl;
398 	disk->d_dump = nvd_dump;
399 	disk->d_gone = nvd_gonecb;
400 	disk->d_name = NVD_STR;
401 	disk->d_unit = ndisk->unit;
402 	disk->d_drv1 = ndisk;
403 
404 	disk->d_sectorsize = nvme_ns_get_sector_size(ns);
405 	disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
406 	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
407 	disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
408 	if (disk->d_delmaxsize > nvd_delete_max)
409 		disk->d_delmaxsize = nvd_delete_max;
410 	disk->d_stripesize = nvme_ns_get_stripesize(ns);
411 	disk->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
412 	if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
413 		disk->d_flags |= DISKFLAG_CANDELETE;
414 	if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
415 		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
416 
417 	/*
418 	 * d_ident and d_descr are both far bigger than the length of either
419 	 *  the serial or model number strings.
420 	 */
421 	nvme_strvis(disk->d_ident, nvme_ns_get_serial_number(ns),
422 	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
423 	nvme_strvis(descr, nvme_ns_get_model_number(ns), sizeof(descr),
424 	    NVME_MODEL_NUMBER_LENGTH);
425 	strlcpy(disk->d_descr, descr, sizeof(descr));
426 
427 	disk->d_rotation_rate = DISK_RR_NON_ROTATING;
428 
429 	disk_create(disk, DISK_VERSION);
430 
431 	printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
432 	printf(NVD_STR"%u: %juMB (%ju %u byte sectors)\n", disk->d_unit,
433 		(uintmax_t)disk->d_mediasize / (1024*1024),
434 		(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
435 		disk->d_sectorsize);
436 
437 	return (ndisk);
438 }
439 
440 static void
441 nvd_controller_fail(void *ctrlr_arg)
442 {
443 	struct nvd_controller	*ctrlr = ctrlr_arg;
444 	struct nvd_disk		*ndisk;
445 
446 	mtx_lock(&nvd_lock);
447 	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
448 	TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
449 		nvd_gone(ndisk);
450 	while (!TAILQ_EMPTY(&ctrlr->disk_head))
451 		msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_fail", 0);
452 	mtx_unlock(&nvd_lock);
453 	free(ctrlr, M_NVD);
454 }
455 
456