xref: /freebsd/sys/dev/nvme/nvme_ns.c (revision 61ba55bcf70f2340f9c943c9571113b3fd8eda69)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2012-2013 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/systm.h>

#include <dev/pci/pcivar.h>

#include <geom/geom.h>

#include "nvme_private.h"

static void		nvme_bio_child_inbed(struct bio *parent, int bio_error);
static void		nvme_bio_child_done(void *arg,
					    const struct nvme_completion *cpl);
static uint32_t		nvme_get_num_segments(uint64_t addr, uint64_t size,
					      uint32_t alignment);
static void		nvme_free_child_bios(int num_bios,
					     struct bio **child_bios);
static struct bio **	nvme_allocate_child_bios(int num_bios);
static struct bio **	nvme_construct_child_bios(struct bio *bp,
						  uint32_t alignment,
						  int *num_bios);
static int		nvme_ns_split_bio(struct nvme_namespace *ns,
					  struct bio *bp,
					  uint32_t alignment);

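/*
 * Namespace character device ioctl handler: services the driver's I/O test
 * and passthrough commands, reports the owning controller's device name and
 * the NSID, and answers the standard disk ioctls for media and sector size.
 */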
static int
nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvme_namespace			*ns;
	struct nvme_controller			*ctrlr;
	struct nvme_pt_command			*pt;

	ns = cdev->si_drv1;
	ctrlr = ns->ctrlr;

	switch (cmd) {
	case NVME_IO_TEST:
	case NVME_BIO_TEST:
		nvme_ns_test(ns, cmd, arg);
		break;
	case NVME_PASSTHROUGH_CMD:
		pt = (struct nvme_pt_command *)arg;
		return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, ns->id,
		    1 /* is_user_buffer */, 0 /* is_admin_cmd */));
	case NVME_GET_NSID:
	{
		struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
		strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
		    sizeof(gnsid->cdev));
		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
		gnsid->nsid = ns->id;
		break;
	}
	case DIOCGMEDIASIZE:
		*(off_t *)arg = (off_t)nvme_ns_get_size(ns);
		break;
	case DIOCGSECTORSIZE:
		*(u_int *)arg = nvme_ns_get_sector_size(ns);
		break;
	default:
		return (ENOTTY);
	}

	return (0);
}

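/*
 * Opens for write are refused when the securelevel is raised above 0;
 * read-only opens are always allowed.
 */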
static int
nvme_ns_open(struct cdev *dev __unused, int flags, int fmt __unused,
    struct thread *td)
{
	int error = 0;

	if (flags & FWRITE)
		error = securelevel_gt(td->td_ucred, 0);

	return (error);
}

static int
nvme_ns_close(struct cdev *dev __unused, int flags, int fmt __unused,
    struct thread *td)
{

	return (0);
}

static void
nvme_ns_strategy_done(void *arg, const struct nvme_completion *cpl)
{
	struct bio *bp = arg;

	/*
	 * TODO: add more extensive translation of NVMe status codes
	 *  to different bio error codes (e.g. EIO, EINVAL, etc.)
	 */
	if (nvme_completion_is_error(cpl)) {
		bp->bio_error = EIO;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
	} else
		bp->bio_resid = 0;

	biodone(bp);
}

static void
nvme_ns_strategy(struct bio *bp)
{
	struct nvme_namespace	*ns;
	int			err;

	ns = bp->bio_dev->si_drv1;
	err = nvme_ns_bio_process(ns, bp, nvme_ns_strategy_done);

	if (err) {
		bp->bio_error = err;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
}

static struct cdevsw nvme_ns_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	D_DISK,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_open =	nvme_ns_open,
	.d_close =	nvme_ns_close,
	.d_strategy =	nvme_ns_strategy,
	.d_ioctl =	nvme_ns_ioctl
};

uint32_t
nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns)
{
	return ns->ctrlr->max_xfer_size;
}

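/*
 * The sector size is 2^LBADS bytes, where LBADS is taken from the LBA
 * format currently selected by the FLBAS field of the namespace data.
 */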
uint32_t
nvme_ns_get_sector_size(struct nvme_namespace *ns)
{
	uint8_t flbas_fmt, lbads;

	flbas_fmt = (ns->data.flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
		NVME_NS_DATA_FLBAS_FORMAT_MASK;
	lbads = (ns->data.lbaf[flbas_fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) &
		NVME_NS_DATA_LBAF_LBADS_MASK;

	return (1 << lbads);
}

uint64_t
nvme_ns_get_num_sectors(struct nvme_namespace *ns)
{
	return (ns->data.nsze);
}

uint64_t
nvme_ns_get_size(struct nvme_namespace *ns)
{
	return (nvme_ns_get_num_sectors(ns) * nvme_ns_get_sector_size(ns));
}

uint32_t
nvme_ns_get_flags(struct nvme_namespace *ns)
{
	return (ns->flags);
}

const char *
nvme_ns_get_serial_number(struct nvme_namespace *ns)
{
	return ((const char *)ns->ctrlr->cdata.sn);
}

const char *
nvme_ns_get_model_number(struct nvme_namespace *ns)
{
	return ((const char *)ns->ctrlr->cdata.mn);
}

const struct nvme_namespace_data *
nvme_ns_get_data(struct nvme_namespace *ns)
{

	return (&ns->data);
}

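/*
 * Report the namespace preferred write alignment (NPWA) or, failing that,
 * the preferred write granularity (NPWG) when the namespace marks those
 * fields valid via NSFEAT; both are 0's based counts of sectors.  Otherwise
 * fall back to the boundary derived at namespace construction time.
 */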
uint32_t
nvme_ns_get_stripesize(struct nvme_namespace *ns)
{
	uint32_t ss;

	if (((ns->data.nsfeat >> NVME_NS_DATA_NSFEAT_NPVALID_SHIFT) &
	    NVME_NS_DATA_NSFEAT_NPVALID_MASK) != 0) {
		ss = nvme_ns_get_sector_size(ns);
		if (ns->data.npwa != 0)
			return ((ns->data.npwa + 1) * ss);
		else if (ns->data.npwg != 0)
			return ((ns->data.npwg + 1) * ss);
	}
	return (ns->boundary);
}

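/*
 * Common bio completion handler: frees any DSM range allocated for
 * BIO_DELETE (stashed in bio_driver2), translates an NVMe error status
 * into EIO, and then invokes the original callback saved in bio_driver1.
 */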
static void
nvme_ns_bio_done(void *arg, const struct nvme_completion *status)
{
	struct bio	*bp = arg;
	nvme_cb_fn_t	bp_cb_fn;

	bp_cb_fn = bp->bio_driver1;

	if (bp->bio_driver2)
		free(bp->bio_driver2, M_NVME);

	if (nvme_completion_is_error(status)) {
		bp->bio_flags |= BIO_ERROR;
		if (bp->bio_error == 0)
			bp->bio_error = EIO;
	}

	if ((bp->bio_flags & BIO_ERROR) == 0)
		bp->bio_resid = 0;
	else
		bp->bio_resid = bp->bio_bcount;

	bp_cb_fn(bp, status);
}

static void
nvme_bio_child_inbed(struct bio *parent, int bio_error)
{
	struct nvme_completion	parent_cpl;
	int			children, inbed;

	if (bio_error != 0) {
		parent->bio_flags |= BIO_ERROR;
		parent->bio_error = bio_error;
	}

	/*
	 * atomic_fetchadd will return the value before adding 1, so we still
	 *  must add 1 to get the updated inbed number.  Save bio_children
	 *  before incrementing to guard against race conditions when
	 *  two child bios complete on different queues.
	 */
	children = atomic_load_acq_int(&parent->bio_children);
	inbed = atomic_fetchadd_int(&parent->bio_inbed, 1) + 1;
	if (inbed == children) {
		bzero(&parent_cpl, sizeof(parent_cpl));
		if (parent->bio_flags & BIO_ERROR) {
			parent_cpl.status &= ~(NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT);
			parent_cpl.status |= (NVME_SC_DATA_TRANSFER_ERROR) << NVME_STATUS_SC_SHIFT;
		}
		nvme_ns_bio_done(parent, &parent_cpl);
	}
}

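/*
 * Completion handler for a child bio: release the child and fold its
 * status into the parent via nvme_bio_child_inbed().
 */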
static void
nvme_bio_child_done(void *arg, const struct nvme_completion *cpl)
{
	struct bio		*child = arg;
	struct bio		*parent;
	int			bio_error;

	parent = child->bio_parent;
	g_destroy_bio(child);
	bio_error = nvme_completion_is_error(cpl) ? EIO : 0;
	nvme_bio_child_inbed(parent, bio_error);
}

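/*
 * Number of align-sized segments that a transfer of 'size' bytes starting
 * at 'addr' touches.  For example, with a 128k alignment, a 256k transfer
 * that starts 64k into a boundary spans three segments.
 */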
static uint32_t
nvme_get_num_segments(uint64_t addr, uint64_t size, uint32_t align)
{
	uint32_t	num_segs, offset, remainder;

	if (align == 0)
		return (1);

	KASSERT((align & (align - 1)) == 0, ("alignment not power of 2\n"));

	num_segs = size / align;
	remainder = size & (align - 1);
	offset = addr & (align - 1);
	if (remainder > 0 || offset > 0)
		num_segs += 1 + (remainder + offset - 1) / align;
	return (num_segs);
}

static void
nvme_free_child_bios(int num_bios, struct bio **child_bios)
{
	int i;

	for (i = 0; i < num_bios; i++) {
		if (child_bios[i] != NULL)
			g_destroy_bio(child_bios[i]);
	}

	free(child_bios, M_NVME);
}

static struct bio **
nvme_allocate_child_bios(int num_bios)
{
	struct bio **child_bios;
	int err = 0, i;

	child_bios = malloc(num_bios * sizeof(struct bio *), M_NVME, M_NOWAIT);
	if (child_bios == NULL)
		return (NULL);

	for (i = 0; i < num_bios; i++) {
		child_bios[i] = g_new_bio();
		if (child_bios[i] == NULL)
			err = ENOMEM;
	}

	if (err == ENOMEM) {
		nvme_free_child_bios(num_bios, child_bios);
		return (NULL);
	}

	return (child_bios);
}

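/*
 * Carve the parent bio into child bios, none of which crosses an
 * 'alignment' boundary.  For unmapped (BIO_UNMAPPED) bios the vm_page
 * array and the offset into its first page are advanced per child;
 * mapped bios simply advance the data pointer.
 */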
static struct bio **
nvme_construct_child_bios(struct bio *bp, uint32_t alignment, int *num_bios)
{
	struct bio	**child_bios;
	struct bio	*child;
	uint64_t	cur_offset;
	caddr_t		data;
	uint32_t	rem_bcount;
	int		i;
	struct vm_page	**ma;
	uint32_t	ma_offset;

	*num_bios = nvme_get_num_segments(bp->bio_offset, bp->bio_bcount,
	    alignment);
	child_bios = nvme_allocate_child_bios(*num_bios);
	if (child_bios == NULL)
		return (NULL);

	bp->bio_children = *num_bios;
	bp->bio_inbed = 0;
	cur_offset = bp->bio_offset;
	rem_bcount = bp->bio_bcount;
	data = bp->bio_data;
	ma_offset = bp->bio_ma_offset;
	ma = bp->bio_ma;

	for (i = 0; i < *num_bios; i++) {
		child = child_bios[i];
		child->bio_parent = bp;
		child->bio_cmd = bp->bio_cmd;
		child->bio_offset = cur_offset;
		child->bio_bcount = min(rem_bcount,
		    alignment - (cur_offset & (alignment - 1)));
		child->bio_flags = bp->bio_flags;
		if (bp->bio_flags & BIO_UNMAPPED) {
			child->bio_ma_offset = ma_offset;
			child->bio_ma = ma;
			child->bio_ma_n =
			    nvme_get_num_segments(child->bio_ma_offset,
				child->bio_bcount, PAGE_SIZE);
			ma_offset = (ma_offset + child->bio_bcount) &
			    PAGE_MASK;
			ma += child->bio_ma_n;
			if (ma_offset != 0)
				ma -= 1;
		} else {
			child->bio_data = data;
			data += child->bio_bcount;
		}
		cur_offset += child->bio_bcount;
		rem_bcount -= child->bio_bcount;
	}

	return (child_bios);
}

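/*
 * Submit each child bio.  A child that fails to submit is destroyed and
 * immediately counted as "inbed" with its error, so the parent still
 * completes once every child has been accounted for.
 */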
static int
nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp,
    uint32_t alignment)
{
	struct bio	*child;
	struct bio	**child_bios;
	int		err, i, num_bios;

	child_bios = nvme_construct_child_bios(bp, alignment, &num_bios);
	if (child_bios == NULL)
		return (ENOMEM);

	for (i = 0; i < num_bios; i++) {
		child = child_bios[i];
		err = nvme_ns_bio_process(ns, child, nvme_bio_child_done);
		if (err != 0) {
			nvme_bio_child_inbed(bp, err);
			g_destroy_bio(child);
		}
	}

	free(child_bios, M_NVME);
	return (0);
}

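/*
 * Main bio entry point for the namespace.  The caller's completion callback
 * is stashed in bio_driver1.  Reads and writes that cross the namespace
 * boundary are split into multiple child transfers; flushes and deletes
 * (deallocate, using a single DSM range) are issued directly.
 */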
int
nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
	nvme_cb_fn_t cb_fn)
{
	struct nvme_dsm_range	*dsm_range;
	uint32_t		num_bios;
	int			err;

	bp->bio_driver1 = cb_fn;

	if (ns->boundary > 0 &&
	    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
		num_bios = nvme_get_num_segments(bp->bio_offset,
		    bp->bio_bcount, ns->boundary);
		if (num_bios > 1)
			return (nvme_ns_split_bio(ns, bp, ns->boundary));
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		err = nvme_ns_cmd_read_bio(ns, bp, nvme_ns_bio_done, bp);
		break;
	case BIO_WRITE:
		err = nvme_ns_cmd_write_bio(ns, bp, nvme_ns_bio_done, bp);
		break;
	case BIO_FLUSH:
		err = nvme_ns_cmd_flush(ns, nvme_ns_bio_done, bp);
		break;
	case BIO_DELETE:
		dsm_range =
		    malloc(sizeof(struct nvme_dsm_range), M_NVME,
		    M_ZERO | M_NOWAIT);
		if (!dsm_range) {
			err = ENOMEM;
			break;
		}
		dsm_range->length =
		    htole32(bp->bio_bcount/nvme_ns_get_sector_size(ns));
		dsm_range->starting_lba =
		    htole64(bp->bio_offset/nvme_ns_get_sector_size(ns));
		bp->bio_driver2 = dsm_range;
		err = nvme_ns_cmd_deallocate(ns, dsm_range, 1,
			nvme_ns_bio_done, bp);
		if (err != 0)
			free(dsm_range, M_NVME);
		break;
	default:
		err = EOPNOTSUPP;
		break;
	}

	return (err);
}

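/*
 * Helper that lets other code paths reuse the namespace cdev ioctl handler.
 */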
int
nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd, caddr_t arg,
    int flag, struct thread *td)
{
	return (nvme_ns_ioctl(ns->cdev, cmd, arg, flag, td));
}

int
nvme_ns_construct(struct nvme_namespace *ns, uint32_t id,
    struct nvme_controller *ctrlr)
{
	struct make_dev_args			md_args;
	struct nvme_completion_poll_status	status;
	int					res;
	int					unit;
	uint8_t					flbas_fmt;
	uint8_t					vwc_present;

	ns->ctrlr = ctrlr;
	ns->id = id;

	/*
	 * Namespaces are reconstructed after a controller reset, so check
	 *  to make sure we only call mtx_init once on each mtx.
	 *
	 * TODO: Move this somewhere where it gets called at controller
	 *  construction time, which is not invoked as part of each
	 *  controller reset.
	 */
	if (!mtx_initialized(&ns->lock))
		mtx_init(&ns->lock, "nvme ns lock", NULL, MTX_DEF);

	status.done = 0;
	nvme_ctrlr_cmd_identify_namespace(ctrlr, id, &ns->data,
	    nvme_completion_poll_cb, &status);
	nvme_completion_poll(&status);
	if (nvme_completion_is_error(&status.cpl)) {
		nvme_printf(ctrlr, "nvme_identify_namespace failed\n");
		return (ENXIO);
	}

	/* Convert data to host endian */
	nvme_namespace_data_swapbytes(&ns->data);

	/*
	 * If the size is zero, chances are this isn't a valid
	 * namespace (e.g. one that's not been configured yet). The
	 * standard says the entire id will be zeros, so this is a
	 * cheap way to test for that.
	 */
	if (ns->data.nsze == 0)
		return (ENXIO);

	flbas_fmt = (ns->data.flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
		NVME_NS_DATA_FLBAS_FORMAT_MASK;
	/*
	 * Note: format is a 0-based value, so > is appropriate here,
	 *  not >=.
	 */
	if (flbas_fmt > ns->data.nlbaf) {
		nvme_printf(ctrlr,
		    "lba format %d exceeds number supported (%d)\n",
		    flbas_fmt, ns->data.nlbaf + 1);
		return (ENXIO);
	}

	/*
	 * Older Intel devices (like the PC35xxx and P45xx series) advertise in
	 * vendor specific space an alignment that improves performance.  If
	 * present, use it for the stripe size.  NVMe 1.3 standardized this as
	 * NOIOB, and newer Intel drives use that.
	 */
	if ((ctrlr->quirks & QUIRK_INTEL_ALIGNMENT) != 0) {
		if (ctrlr->cdata.vs[3] != 0)
			ns->boundary =
			    1 << (ctrlr->cdata.vs[3] + NVME_MPS_SHIFT +
				NVME_CAP_HI_MPSMIN(ctrlr->cap_hi));
		else
			ns->boundary = 0;
	} else {
		ns->boundary = ns->data.noiob * nvme_ns_get_sector_size(ns);
	}

	if (nvme_ctrlr_has_dataset_mgmt(&ctrlr->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	vwc_present = (ctrlr->cdata.vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) &
		NVME_CTRLR_DATA_VWC_PRESENT_MASK;
	if (vwc_present)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * cdev may have already been created, if we are reconstructing the
	 *  namespace after a controller-level reset.
	 */
	if (ns->cdev != NULL)
		return (0);

	/*
	 * Namespace IDs start at 1, so we need to subtract 1 to create a
	 *  correct unit number.
	 */
	unit = device_get_unit(ctrlr->dev) * NVME_MAX_NAMESPACES + ns->id - 1;

	make_dev_args_init(&md_args);
	md_args.mda_devsw = &nvme_ns_cdevsw;
	md_args.mda_unit = unit;
	md_args.mda_mode = 0600;
	md_args.mda_si_drv1 = ns;
	res = make_dev_s(&md_args, &ns->cdev, "nvme%dns%d",
	    device_get_unit(ctrlr->dev), ns->id);
	if (res != 0)
		return (ENXIO);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (0);
}


void
nvme_ns_destruct(struct nvme_namespace *ns)
{

	if (ns->cdev != NULL)
		destroy_dev(ns->cdev);
}
628