xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision 44bac77bf8165ebe38afb85dda247b928d88edf8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/refcount.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/fs/zfs.h>
34 #include <sys/zio.h>
35 #include <sys/sunldi.h>
36 
37 /*
38  * Virtual device vector for disks.
39  */
40 
/*
 * LDI identity passed to every ldi_open_by_*() call in this file.  It is
 * only declared here; the definition lives outside this file.
 */
extern ldi_ident_t zfs_li;

/*
 * Per-I/O wrapper pairing the buf_t handed to ldi_strategy() with the zio
 * that issued it.  vdb_buf must stay the first member: the completion
 * handler vdev_disk_io_intr() casts the buf_t pointer it receives straight
 * back to a vdev_disk_buf_t pointer.
 */
typedef struct vdev_disk_buf {
	buf_t	vdb_buf;	/* buffer submitted to the device */
	zio_t	*vdb_io;	/* zio to complete when the buffer is done */
} vdev_disk_buf_t;
47 
48 static int
49 vdev_disk_open_common(vdev_t *vd)
50 {
51 	vdev_disk_t *dvd;
52 	dev_t dev;
53 	int error;
54 
55 	/*
56 	 * We must have a pathname, and it must be absolute.
57 	 */
58 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
59 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
60 		return (EINVAL);
61 	}
62 
63 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
64 
65 	/*
66 	 * When opening a disk device, we want to preserve the user's original
67 	 * intent.  We always want to open the device by the path the user gave
68 	 * us, even if it is one of multiple paths to the save device.  But we
69 	 * also want to be able to survive disks being removed/recabled.
70 	 * Therefore the sequence of opening devices is:
71 	 *
72 	 * 1. Try opening the device by path.  For legacy pools without the
73 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
74 	 *
75 	 * 2. If the devid of the device matches the stored value, return
76 	 *    success.
77 	 *
78 	 * 3. Otherwise, the device may have moved.  Try opening the device
79 	 *    by the devid instead.
80 	 *
81 	 */
82 	if (vd->vdev_devid != NULL) {
83 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
84 		    &dvd->vd_minor) != 0) {
85 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
86 			return (EINVAL);
87 		}
88 	}
89 
90 	error = EINVAL;		/* presume failure */
91 
92 	if (vd->vdev_path != NULL) {
93 		ddi_devid_t devid;
94 
95 		if (vd->vdev_wholedisk == -1ULL) {
96 			size_t len = strlen(vd->vdev_path) + 3;
97 			char *buf = kmem_alloc(len, KM_SLEEP);
98 			ldi_handle_t lh;
99 
100 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
101 
102 			if (ldi_open_by_name(buf, spa_mode, kcred,
103 			    &lh, zfs_li) == 0) {
104 				spa_strfree(vd->vdev_path);
105 				vd->vdev_path = buf;
106 				vd->vdev_wholedisk = 1ULL;
107 				(void) ldi_close(lh, spa_mode, kcred);
108 			} else {
109 				kmem_free(buf, len);
110 			}
111 		}
112 
113 		error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
114 		    &dvd->vd_lh, zfs_li);
115 
116 		/*
117 		 * Compare the devid to the stored value.
118 		 */
119 		if (error == 0 && vd->vdev_devid != NULL &&
120 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
121 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
122 				error = EINVAL;
123 				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
124 				dvd->vd_lh = NULL;
125 			}
126 			ddi_devid_free(devid);
127 		}
128 
129 		/*
130 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
131 		 * is not yet set, then this must be a slice.
132 		 */
133 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
134 			vd->vdev_wholedisk = 0;
135 	}
136 
137 	/*
138 	 * If we were unable to open by path, or the devid check fails, open by
139 	 * devid instead.
140 	 */
141 	if (error != 0 && vd->vdev_devid != NULL)
142 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
143 		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
144 
145 	/*
146 	 * If all else fails, then try opening by physical path (if available)
147 	 * or the logical path (if we failed due to the devid check).  While not
148 	 * as reliable as the devid, this will give us something, and the higher
149 	 * level vdev validation will prevent us from opening the wrong device.
150 	 */
151 	if (error) {
152 		if (vd->vdev_physpath != NULL &&
153 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV)
154 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode,
155 			    kcred, &dvd->vd_lh, zfs_li);
156 
157 		/*
158 		 * Note that we don't support the legacy auto-wholedisk support
159 		 * as above.  This hasn't been used in a very long time and we
160 		 * don't need to propagate its oddities to this edge condition.
161 		 */
162 		if (error && vd->vdev_path != NULL)
163 			error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
164 			    &dvd->vd_lh, zfs_li);
165 	}
166 
167 	if (error)
168 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
169 
170 	return (error);
171 }
172 
173 static int
174 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
175 {
176 	vdev_disk_t *dvd;
177 	struct dk_minfo dkm;
178 	int error;
179 	dev_t dev;
180 	int otyp;
181 
182 	error = vdev_disk_open_common(vd);
183 	if (error)
184 		return (error);
185 
186 	dvd = vd->vdev_tsd;
187 	/*
188 	 * Once a device is opened, verify that the physical device path (if
189 	 * available) is up to date.
190 	 */
191 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
192 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
193 		char *physpath, *minorname;
194 
195 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
196 		minorname = NULL;
197 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
198 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
199 		    (vd->vdev_physpath == NULL ||
200 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
201 			if (vd->vdev_physpath)
202 				spa_strfree(vd->vdev_physpath);
203 			(void) strlcat(physpath, ":", MAXPATHLEN);
204 			(void) strlcat(physpath, minorname, MAXPATHLEN);
205 			vd->vdev_physpath = spa_strdup(physpath);
206 		}
207 		if (minorname)
208 			kmem_free(minorname, strlen(minorname) + 1);
209 		kmem_free(physpath, MAXPATHLEN);
210 	}
211 
212 	/*
213 	 * Determine the actual size of the device.
214 	 */
215 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
216 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
217 		return (EINVAL);
218 	}
219 
220 	/*
221 	 * If we own the whole disk, try to enable disk write caching.
222 	 * We ignore errors because it's OK if we can't do it.
223 	 */
224 	if (vd->vdev_wholedisk == 1) {
225 		int wce = 1;
226 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
227 		    FKIOCTL, kcred, NULL);
228 	}
229 
230 	/*
231 	 * Determine the device's minimum transfer size.
232 	 * If the ioctl isn't supported, assume DEV_BSIZE.
233 	 */
234 	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
235 	    FKIOCTL, kcred, NULL) != 0)
236 		dkm.dki_lbsize = DEV_BSIZE;
237 
238 	*ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
239 
240 	/*
241 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
242 	 * try again.
243 	 */
244 	vd->vdev_nowritecache = B_FALSE;
245 
246 	return (0);
247 }
248 
249 static void
250 vdev_disk_close(vdev_t *vd)
251 {
252 	vdev_disk_t *dvd = vd->vdev_tsd;
253 
254 	if (dvd == NULL)
255 		return;
256 
257 	if (dvd->vd_minor != NULL)
258 		ddi_devid_str_free(dvd->vd_minor);
259 
260 	if (dvd->vd_devid != NULL)
261 		ddi_devid_free(dvd->vd_devid);
262 
263 	if (dvd->vd_lh != NULL)
264 		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
265 
266 	kmem_free(dvd, sizeof (vdev_disk_t));
267 	vd->vdev_tsd = NULL;
268 }
269 
270 int
271 vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
272     uint64_t offset, int flags)
273 {
274 	buf_t *bp;
275 	int error = 0;
276 
277 	if (vd_lh == NULL)
278 		return (EINVAL);
279 
280 	ASSERT(flags & B_READ || flags & B_WRITE);
281 
282 	bp = getrbuf(KM_SLEEP);
283 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
284 	bp->b_bcount = size;
285 	bp->b_un.b_addr = (void *)data;
286 	bp->b_lblkno = lbtodb(offset);
287 	bp->b_bufsize = size;
288 
289 	error = ldi_strategy(vd_lh, bp);
290 	ASSERT(error == 0);
291 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
292 		error = EIO;
293 	freerbuf(bp);
294 
295 	return (error);
296 }
297 
298 static int
299 vdev_disk_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
300     int flags)
301 {
302 	int error = 0;
303 	vdev_disk_t *dvd = vd ? vd->vdev_tsd : NULL;
304 
305 	if (vd == NULL || dvd == NULL || dvd->vd_lh == NULL)
306 		return (EINVAL);
307 
308 	error = vdev_disk_physio(dvd->vd_lh, data, size, offset, flags);
309 
310 	if (zio_injection_enabled && error == 0)
311 		error = zio_handle_device_injection(vd, EIO);
312 
313 	return (error);
314 }
315 
316 /*
317  * Determine if the underlying device is accessible by reading and writing
318  * to a known location. We must be able to do this during syncing context
319  * and thus we cannot set the vdev state directly.
320  */
321 static int
322 vdev_disk_probe(vdev_t *vd)
323 {
324 	uint64_t offset;
325 	vdev_t *nvd;
326 	int l, error = 0, retries = 0;
327 	char *vl_pad;
328 
329 	if (vd == NULL)
330 		return (EINVAL);
331 
332 	/* Hijack the current vdev */
333 	nvd = vd;
334 
335 	/*
336 	 * Pick a random label to rewrite.
337 	 */
338 	l = spa_get_random(VDEV_LABELS);
339 	ASSERT(l < VDEV_LABELS);
340 
341 	offset = vdev_label_offset(vd->vdev_psize, l,
342 	    offsetof(vdev_label_t, vl_pad));
343 
344 	vl_pad = kmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP);
345 
346 	/*
347 	 * Try to read and write to a special location on the
348 	 * label. We use the existing vdev initially and only
349 	 * try to create and reopen it if we encounter a failure.
350 	 */
351 	while ((error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
352 	    offset, B_READ)) != 0 && retries == 0) {
353 
354 		nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
355 		if (vd->vdev_path)
356 			nvd->vdev_path = spa_strdup(vd->vdev_path);
357 		if (vd->vdev_physpath)
358 			nvd->vdev_physpath = spa_strdup(vd->vdev_physpath);
359 		if (vd->vdev_devid)
360 			nvd->vdev_devid = spa_strdup(vd->vdev_devid);
361 		nvd->vdev_wholedisk = vd->vdev_wholedisk;
362 		nvd->vdev_guid = vd->vdev_guid;
363 		retries++;
364 
365 		error = vdev_disk_open_common(nvd);
366 		if (error)
367 			break;
368 	}
369 
370 	if (!error) {
371 		error = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
372 		    offset, B_WRITE);
373 	}
374 
375 	/* Clean up if we allocated a new vdev */
376 	if (retries) {
377 		vdev_disk_close(nvd);
378 		if (nvd->vdev_path)
379 			spa_strfree(nvd->vdev_path);
380 		if (nvd->vdev_physpath)
381 			spa_strfree(nvd->vdev_physpath);
382 		if (nvd->vdev_devid)
383 			spa_strfree(nvd->vdev_devid);
384 		kmem_free(nvd, sizeof (vdev_t));
385 	}
386 	kmem_free(vl_pad, VDEV_SKIP_SIZE);
387 
388 	/* Reset the failing flag */
389 	if (!error)
390 		vd->vdev_is_failing = B_FALSE;
391 
392 	return (error);
393 }
394 
395 static void
396 vdev_disk_io_intr(buf_t *bp)
397 {
398 	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
399 	zio_t *zio = vdb->vdb_io;
400 
401 	if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
402 		zio->io_error = EIO;
403 
404 	kmem_free(vdb, sizeof (vdev_disk_buf_t));
405 
406 	zio_interrupt(zio);
407 }
408 
409 static void
410 vdev_disk_ioctl_done(void *zio_arg, int error)
411 {
412 	zio_t *zio = zio_arg;
413 
414 	zio->io_error = error;
415 
416 	zio_interrupt(zio);
417 }
418 
/*
 * Start an I/O on a disk vdev.
 *
 * ioctl zios are handled inline: only DKIOCFLUSHWRITECACHE is supported,
 * every other command completes with ENOTSUP.  Read and write zios are
 * wrapped in a vdev_disk_buf_t and submitted with ldi_strategy(); their
 * completion arrives through vdev_disk_io_intr().
 *
 * Returns ZIO_PIPELINE_CONTINUE when the zio has been finished here (its
 * io_error is already final), or ZIO_PIPELINE_STOP when completion will be
 * signalled later or the zio was taken over by the cache/queue layers.
 */
static int
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vdev_disk_buf_t *vdb;
	buf_t *bp;
	int flags, error;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		zio_vdev_io_bypass(zio);

		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = ENXIO;
			return (ZIO_PIPELINE_CONTINUE);
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			/* flushing disabled by tunable: complete as a no-op */
			if (zfs_nocacheflush)
				break;

			/* an earlier attempt showed the device can't do it */
			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
			zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
			zio->io_dk_callback.dkc_cookie = zio;

			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
			    (uintptr_t)&zio->io_dk_callback,
			    FKIOCTL, kcred, NULL);

			if (error == 0) {
				/*
				 * The ioctl will be done asynchronously,
				 * and will call vdev_disk_ioctl_done()
				 * upon completion.
				 */
				return (ZIO_PIPELINE_STOP);
			}

			if (error == ENOTSUP || error == ENOTTY) {
				/*
				 * If we get ENOTSUP or ENOTTY, we know that
				 * no future attempts will ever succeed.
				 * In this case we set a persistent bit so
				 * that we don't bother with the ioctl in the
				 * future.
				 */
				vd->vdev_nowritecache = B_TRUE;
			}
			zio->io_error = error;

			break;

		default:
			zio->io_error = ENOTSUP;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * NOTE(review): a zero return from vdev_cache_read() presumably means
	 * the cache layer has taken over the read -- confirm against
	 * vdev_cache.c.
	 */
	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
		return (ZIO_PIPELINE_STOP);

	/*
	 * vdev_queue_io() may return a different zio than the one passed in,
	 * or NULL when there is nothing to issue right now; from here on
	 * 'zio' is whatever the queue handed back.
	 */
	if ((zio = vdev_queue_io(zio)) == NULL)
		return (ZIO_PIPELINE_STOP);

	if (zio->io_type == ZIO_TYPE_WRITE)
		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
	else
		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
	/* a pending removal or a failing device overrides any other result */
	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return (ZIO_PIPELINE_STOP);
	}

	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	flags |= B_BUSY | B_NOCACHE;
	if (zio->io_flags & ZIO_FLAG_FAILFAST)
		flags |= B_FAILFAST;

	/* freed by vdev_disk_io_intr() when the buffer completes */
	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);

	vdb->vdb_io = zio;
	bp = &vdb->vdb_buf;

	bioinit(bp);
	bp->b_flags = flags;
	bp->b_bcount = zio->io_size;
	bp->b_un.b_addr = zio->io_data;
	bp->b_lblkno = lbtodb(zio->io_offset);
	bp->b_bufsize = zio->io_size;
	bp->b_iodone = (int (*)())vdev_disk_io_intr;

	error = ldi_strategy(dvd->vd_lh, bp);
	/* ldi_strategy() will return non-zero only on programming errors */
	ASSERT(error == 0);

	return (ZIO_PIPELINE_STOP);
}
529 
530 static int
531 vdev_disk_io_done(zio_t *zio)
532 {
533 	vdev_queue_io_done(zio);
534 
535 	if (zio->io_type == ZIO_TYPE_WRITE)
536 		vdev_cache_write(zio);
537 
538 	if (zio_injection_enabled && zio->io_error == 0)
539 		zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
540 
541 	/*
542 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
543 	 * the device has been removed.  If this is the case, then we trigger an
544 	 * asynchronous removal of the device. Otherwise, probe the device and
545 	 * make sure it's still accessible.
546 	 */
547 	if (zio->io_error == EIO) {
548 		vdev_t *vd = zio->io_vd;
549 		vdev_disk_t *dvd = vd->vdev_tsd;
550 		int state;
551 
552 		state = DKIO_NONE;
553 		if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
554 		    FKIOCTL, kcred, NULL) == 0 &&
555 		    state != DKIO_INSERTED) {
556 			vd->vdev_remove_wanted = B_TRUE;
557 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
558 		} else if (vdev_probe(vd) != 0) {
559 			ASSERT(vd->vdev_ops->vdev_op_leaf);
560 			vd->vdev_is_failing = B_TRUE;
561 		}
562 	}
563 
564 	return (ZIO_PIPELINE_CONTINUE);
565 }
566 
/*
 * Operations vector for disk vdevs.  The initializer is positional, so the
 * entries must stay in the order in which vdev_ops_t declares its members.
 * NOTE(review): vdev_ops_t is declared outside this file; confirm the slot
 * order there before adding or moving entries.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_disk_probe,
	vdev_default_asize,	/* generic implementation, not disk-specific */
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,			/* no handler supplied for this slot */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
578 
579 /*
580  * Given the root disk device pathname, read the label from the device,
581  * and construct a configuration nvlist.
582  */
583 nvlist_t *
584 vdev_disk_read_rootlabel(char *devpath)
585 {
586 	nvlist_t *config = NULL;
587 	ldi_handle_t vd_lh;
588 	vdev_label_t *label;
589 	uint64_t s, size;
590 	int l;
591 
592 	/*
593 	 * Read the device label and build the nvlist.
594 	 */
595 	if (ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, zfs_li))
596 		return (NULL);
597 
598 	if (ldi_get_size(vd_lh, &s))
599 		return (NULL);
600 
601 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
602 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
603 
604 	for (l = 0; l < VDEV_LABELS; l++) {
605 		uint64_t offset, state, txg = 0;
606 
607 		/* read vdev label */
608 		offset = vdev_label_offset(size, l, 0);
609 		if (vdev_disk_physio(vd_lh, (caddr_t)label,
610 		    VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE +
611 		    VDEV_PHYS_SIZE, offset, B_READ) != 0)
612 			continue;
613 
614 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
615 		    sizeof (label->vl_vdev_phys.vp_nvlist), &config, 0) != 0) {
616 			config = NULL;
617 			continue;
618 		}
619 
620 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
621 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
622 			nvlist_free(config);
623 			config = NULL;
624 			continue;
625 		}
626 
627 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
628 		    &txg) != 0 || txg == 0) {
629 			nvlist_free(config);
630 			config = NULL;
631 			continue;
632 		}
633 
634 		break;
635 	}
636 
637 	kmem_free(label, sizeof (vdev_label_t));
638 	return (config);
639 }
640