xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_disk.c (revision e912cc3d5decbbfbb3005d9f678e9fc3ccbcf91f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright 2020 Joyent, Inc.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa_impl.h>
30 #include <sys/refcount.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_trim.h>
33 #include <sys/abd.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/zio.h>
36 #include <sys/sunldi.h>
37 #include <sys/efi_partition.h>
38 #include <sys/fm/fs/zfs.h>
39 #include <sys/ddi.h>
40 
41 /*
42  * Tunable to disable TRIM in case we're using a problematic SSD.
43  */
44 uint_t zfs_no_trim = 0;
45 
46 /*
47  * Tunable parameter for debugging or performance analysis. Setting this
48  * will cause pool corruption on power loss if a volatile out-of-order
49  * write cache is enabled.
50  */
51 boolean_t zfs_nocacheflush = B_FALSE;
52 
53 /*
54  * Virtual device vector for disks.
55  */
56 
57 extern ldi_ident_t zfs_li;
58 
59 static void vdev_disk_close(vdev_t *);
60 
61 typedef struct vdev_disk {
62 	ddi_devid_t	vd_devid;
63 	char		*vd_minor;
64 	ldi_handle_t	vd_lh;
65 	list_t		vd_ldi_cbs;
66 	boolean_t	vd_ldi_offline;
67 } vdev_disk_t;
68 
69 typedef struct vdev_disk_buf {
70 	buf_t	vdb_buf;
71 	zio_t	*vdb_io;
72 } vdev_disk_buf_t;
73 
74 typedef struct vdev_disk_ldi_cb {
75 	list_node_t		lcb_next;
76 	ldi_callback_id_t	lcb_id;
77 } vdev_disk_ldi_cb_t;
78 
79 /*
80  * Bypass the devid when opening a disk vdev.
81  * There have been issues where the devids of several devices were shuffled,
82  * causing pool open failures. Note, that this flag is intended to be used
83  * for pool recovery only.
84  *
85  * Note that if a pool is imported with the devids bypassed, all its vdevs will
86  * cease storing devid information permanently. In practice, the devid is rarely
87  * useful as vdev paths do not tend to change unless the hardware is
88  * reconfigured. That said, if the paths do change and a pool fails to open
89  * automatically at boot, a simple zpool import should re-scan the paths and fix
90  * the issue.
91  */
92 boolean_t vdev_disk_bypass_devid = B_FALSE;
93 
94 static void
95 vdev_disk_alloc(vdev_t *vd)
96 {
97 	vdev_disk_t *dvd;
98 
99 	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
100 	/*
101 	 * Create the LDI event callback list.
102 	 */
103 	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
104 	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
105 }
106 
107 static void
108 vdev_disk_free(vdev_t *vd)
109 {
110 	vdev_disk_t *dvd = vd->vdev_tsd;
111 	vdev_disk_ldi_cb_t *lcb;
112 
113 	if (dvd == NULL)
114 		return;
115 
116 	/*
117 	 * We have already closed the LDI handle. Clean up the LDI event
118 	 * callbacks and free vd->vdev_tsd.
119 	 */
120 	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
121 		list_remove(&dvd->vd_ldi_cbs, lcb);
122 		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
123 		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
124 	}
125 	list_destroy(&dvd->vd_ldi_cbs);
126 	kmem_free(dvd, sizeof (vdev_disk_t));
127 	vd->vdev_tsd = NULL;
128 }
129 
130 static int
131 vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
132     void *arg, void *ev_data __unused)
133 {
134 	vdev_t *vd = (vdev_t *)arg;
135 	vdev_disk_t *dvd = vd->vdev_tsd;
136 
137 	/*
138 	 * Ignore events other than offline.
139 	 */
140 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
141 		return (LDI_EV_SUCCESS);
142 
143 	/*
144 	 * Tell any new threads that stumble upon this vdev that they should not
145 	 * try to do I/O.
146 	 */
147 	dvd->vd_ldi_offline = B_TRUE;
148 
149 	/*
150 	 * Request that the spa_async_thread mark the device as REMOVED and
151 	 * notify FMA of the removal.  This should also trigger a vdev_close()
152 	 * in the async thread.
153 	 */
154 	zfs_post_remove(vd->vdev_spa, vd);
155 	vd->vdev_remove_wanted = B_TRUE;
156 	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
157 
158 	return (LDI_EV_SUCCESS);
159 }
160 
161 static void
162 vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
163     int ldi_result, void *arg, void *ev_data __unused)
164 {
165 	vdev_t *vd = (vdev_t *)arg;
166 
167 	/*
168 	 * Ignore events other than offline.
169 	 */
170 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
171 		return;
172 
173 	/*
174 	 * Request that the vdev be reopened if the offline state change was
175 	 * unsuccessful.
176 	 */
177 	if (ldi_result != LDI_EV_SUCCESS) {
178 		vd->vdev_probe_wanted = B_TRUE;
179 		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
180 	}
181 }
182 
183 static ldi_ev_callback_t vdev_disk_off_callb = {
184 	.cb_vers = LDI_EV_CB_VERS,
185 	.cb_notify = vdev_disk_off_notify,
186 	.cb_finalize = vdev_disk_off_finalize
187 };
188 
189 static void
190 vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
191     int ldi_result, void *arg, void *ev_data __unused)
192 {
193 	vdev_t *vd = (vdev_t *)arg;
194 
195 	/*
196 	 * Ignore events other than degrade.
197 	 */
198 	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
199 		return;
200 
201 	/*
202 	 * Degrade events always succeed. Mark the vdev as degraded.
203 	 * This status is purely informative for the user.
204 	 */
205 	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
206 }
207 
208 static ldi_ev_callback_t vdev_disk_dgrd_callb = {
209 	.cb_vers = LDI_EV_CB_VERS,
210 	.cb_notify = NULL,
211 	.cb_finalize = vdev_disk_dgrd_finalize
212 };
213 
214 static void
215 vdev_disk_hold(vdev_t *vd)
216 {
217 	ddi_devid_t devid;
218 	char *minor;
219 
220 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
221 
222 	/*
223 	 * We must have a pathname, and it must be absolute.
224 	 */
225 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
226 		return;
227 
228 	/*
229 	 * Only prefetch path and devid info if the device has
230 	 * never been opened.
231 	 */
232 	if (vd->vdev_tsd != NULL)
233 		return;
234 
235 	if (vd->vdev_wholedisk == -1ULL) {
236 		size_t len = strlen(vd->vdev_path) + 3;
237 		char *buf = kmem_alloc(len, KM_SLEEP);
238 
239 		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
240 
241 		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
242 		kmem_free(buf, len);
243 	}
244 
245 	if (vd->vdev_name_vp == NULL)
246 		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);
247 
248 	if (vd->vdev_devid != NULL &&
249 	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
250 		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
251 		ddi_devid_str_free(minor);
252 		ddi_devid_free(devid);
253 	}
254 }
255 
256 static void
257 vdev_disk_rele(vdev_t *vd)
258 {
259 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
260 
261 	if (vd->vdev_name_vp) {
262 		VN_RELE_ASYNC(vd->vdev_name_vp,
263 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
264 		vd->vdev_name_vp = NULL;
265 	}
266 	if (vd->vdev_devid_vp) {
267 		VN_RELE_ASYNC(vd->vdev_devid_vp,
268 		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
269 		vd->vdev_devid_vp = NULL;
270 	}
271 }
272 
/*
 * VDEV_DEBUG() logs a CE_NOTE on DEBUG kernels and expands to nothing
 * otherwise.  It is used to be loud when DKIOCGMEDIAINFOEXT fails, or when
 * even the fallback to DKIOCGMEDIAINFO fails.
 */
#ifndef DEBUG
#define	VDEV_DEBUG(...)	/* Nothing... */
#else
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#endif
282 
283 static int
284 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
285     uint64_t *ashift)
286 {
287 	spa_t *spa = vd->vdev_spa;
288 	vdev_disk_t *dvd = vd->vdev_tsd;
289 	ldi_ev_cookie_t ecookie;
290 	vdev_disk_ldi_cb_t *lcb;
291 	union {
292 		struct dk_minfo_ext ude;
293 		struct dk_minfo ud;
294 	} dks;
295 	struct dk_minfo_ext *dkmext = &dks.ude;
296 	struct dk_minfo *dkm = &dks.ud;
297 	int error, can_free;
298 	dev_t dev;
299 	int otyp;
300 	boolean_t validate_devid = B_FALSE;
301 	uint64_t capacity = 0, blksz = 0, pbsize;
302 
303 	/*
304 	 * We must have a pathname, and it must be absolute.
305 	 */
306 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
307 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
308 		return (SET_ERROR(EINVAL));
309 	}
310 
311 	/*
312 	 * Reopen the device if it's not currently open. Otherwise,
313 	 * just update the physical size of the device.
314 	 */
315 	if (dvd != NULL) {
316 		ASSERT(vd->vdev_reopening);
317 		goto skip_open;
318 	}
319 
320 	/*
321 	 * Create vd->vdev_tsd.
322 	 */
323 	vdev_disk_alloc(vd);
324 	dvd = vd->vdev_tsd;
325 
326 	/*
327 	 * Allow bypassing the devid.
328 	 */
329 	if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
330 		vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
331 		    vd->vdev_devid);
332 		spa_strfree(vd->vdev_devid);
333 		vd->vdev_devid = NULL;
334 	}
335 
336 	/*
337 	 * When opening a disk device, we want to preserve the user's original
338 	 * intent.  We always want to open the device by the path the user gave
339 	 * us, even if it is one of multiple paths to the same device.  But we
340 	 * also want to be able to survive disks being removed/recabled.
341 	 * Therefore the sequence of opening devices is:
342 	 *
343 	 * 1. Try opening the device by path.  For legacy pools without the
344 	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
345 	 *
346 	 * 2. If the devid of the device matches the stored value, return
347 	 *    success.
348 	 *
349 	 * 3. Otherwise, the device may have moved.  Try opening the device
350 	 *    by the devid instead.
351 	 */
352 	if (vd->vdev_devid != NULL) {
353 		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
354 		    &dvd->vd_minor) != 0) {
355 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
356 			vdev_dbgmsg(vd, "vdev_disk_open: invalid "
357 			    "vdev_devid '%s'", vd->vdev_devid);
358 			return (SET_ERROR(EINVAL));
359 		}
360 	}
361 
362 	error = EINVAL;		/* presume failure */
363 
364 	if (vd->vdev_path != NULL) {
365 
366 		if (vd->vdev_wholedisk == -1ULL) {
367 			size_t len = strlen(vd->vdev_path) + 3;
368 			char *buf = kmem_alloc(len, KM_SLEEP);
369 
370 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
371 
372 			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
373 			    &dvd->vd_lh, zfs_li);
374 			if (error == 0) {
375 				spa_strfree(vd->vdev_path);
376 				vd->vdev_path = buf;
377 				vd->vdev_wholedisk = 1ULL;
378 			} else {
379 				kmem_free(buf, len);
380 			}
381 		}
382 
383 		/*
384 		 * If we have not yet opened the device, try to open it by the
385 		 * specified path.
386 		 */
387 		if (error != 0) {
388 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
389 			    kcred, &dvd->vd_lh, zfs_li);
390 		}
391 
392 		/*
393 		 * Compare the devid to the stored value.
394 		 */
395 		if (error == 0 && vd->vdev_devid != NULL) {
396 			ddi_devid_t devid = NULL;
397 
398 			if (ldi_get_devid(dvd->vd_lh, &devid) != 0) {
399 				/*
400 				 * We expected a devid on this device but it no
401 				 * longer appears to have one.  The validation
402 				 * step may need to remove it from the
403 				 * configuration.
404 				 */
405 				validate_devid = B_TRUE;
406 
407 			} else if (ddi_devid_compare(devid, dvd->vd_devid) !=
408 			    0) {
409 				/*
410 				 * A mismatch here is unexpected, log it.
411 				 */
412 				char *devid_str = ddi_devid_str_encode(devid,
413 				    dvd->vd_minor);
414 				vdev_dbgmsg(vd, "vdev_disk_open: devid "
415 				    "mismatch: %s != %s", vd->vdev_devid,
416 				    devid_str);
417 				cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
418 				    "mismatch: %s != %s", vd->vdev_path,
419 				    vd->vdev_devid, devid_str);
420 				ddi_devid_str_free(devid_str);
421 
422 				error = SET_ERROR(EINVAL);
423 				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
424 				    kcred);
425 				dvd->vd_lh = NULL;
426 			}
427 
428 			if (devid != NULL) {
429 				ddi_devid_free(devid);
430 			}
431 		}
432 
433 		/*
434 		 * If we succeeded in opening the device, but 'vdev_wholedisk'
435 		 * is not yet set, then this must be a slice.
436 		 */
437 		if (error == 0 && vd->vdev_wholedisk == -1ULL)
438 			vd->vdev_wholedisk = 0;
439 	}
440 
441 	/*
442 	 * If we were unable to open by path, or the devid check fails, open by
443 	 * devid instead.
444 	 */
445 	if (error != 0 && vd->vdev_devid != NULL) {
446 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
447 		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
448 		if (error != 0) {
449 			vdev_dbgmsg(vd, "Failed to open by devid (%s)",
450 			    vd->vdev_devid);
451 		}
452 	}
453 
454 	/*
455 	 * If all else fails, then try opening by physical path (if available)
456 	 * or the logical path (if we failed due to the devid check).  While not
457 	 * as reliable as the devid, this will give us something, and the higher
458 	 * level vdev validation will prevent us from opening the wrong device.
459 	 */
460 	if (error != 0) {
461 		validate_devid = B_TRUE;
462 
463 		if (vd->vdev_physpath != NULL &&
464 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) {
465 			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
466 			    kcred, &dvd->vd_lh, zfs_li);
467 		}
468 
469 		/*
470 		 * Note that we don't support the legacy auto-wholedisk support
471 		 * as above.  This hasn't been used in a very long time and we
472 		 * don't need to propagate its oddities to this edge condition.
473 		 */
474 		if (error != 0 && vd->vdev_path != NULL) {
475 			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
476 			    kcred, &dvd->vd_lh, zfs_li);
477 		}
478 	}
479 
480 	if (error != 0) {
481 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
482 		vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
483 		    error);
484 		return (error);
485 	}
486 
487 	/*
488 	 * Now that the device has been successfully opened, update the devid
489 	 * if necessary.
490 	 */
491 	if (validate_devid) {
492 		ddi_devid_t devid = NULL;
493 		char *minorname = NULL;
494 		char *vd_devid = NULL;
495 		boolean_t remove = B_FALSE, update = B_FALSE;
496 
497 		/*
498 		 * Get the current devid and minor name for the device we
499 		 * opened.
500 		 */
501 		if (ldi_get_devid(dvd->vd_lh, &devid) != 0 ||
502 		    ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) {
503 			/*
504 			 * If we are unable to get the devid or the minor name
505 			 * for the device, we need to remove them from the
506 			 * configuration to prevent potential inconsistencies.
507 			 */
508 			if (dvd->vd_minor != NULL || dvd->vd_devid != NULL ||
509 			    vd->vdev_devid != NULL) {
510 				/*
511 				 * We only need to remove the devid if one
512 				 * exists.
513 				 */
514 				remove = B_TRUE;
515 			}
516 
517 		} else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) {
518 			/*
519 			 * There was previously no devid at all so we need to
520 			 * add one.
521 			 */
522 			update = B_TRUE;
523 
524 		} else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 ||
525 		    strcmp(minorname, dvd->vd_minor) != 0) {
526 			/*
527 			 * The devid or minor name on file does not match the
528 			 * one from the opened device.
529 			 */
530 			update = B_TRUE;
531 		}
532 
533 		if (update) {
534 			/*
535 			 * Render the new devid and minor name as a string for
536 			 * logging and to store in the vdev configuration.
537 			 */
538 			vd_devid = ddi_devid_str_encode(devid, minorname);
539 		}
540 
541 		if (update || remove) {
542 			vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
543 			    "'%s' to '%s'",
544 			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
545 			    vd_devid != NULL ? vd_devid : "<none>");
546 			cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
547 			    "from '%s' to '%s'",
548 			    vd->vdev_path != NULL ? vd->vdev_path : "?",
549 			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
550 			    vd_devid != NULL ? vd_devid : "<none>");
551 
552 			/*
553 			 * Remove and free any existing values.
554 			 */
555 			if (dvd->vd_minor != NULL) {
556 				ddi_devid_str_free(dvd->vd_minor);
557 				dvd->vd_minor = NULL;
558 			}
559 			if (dvd->vd_devid != NULL) {
560 				ddi_devid_free(dvd->vd_devid);
561 				dvd->vd_devid = NULL;
562 			}
563 			if (vd->vdev_devid != NULL) {
564 				spa_strfree(vd->vdev_devid);
565 				vd->vdev_devid = NULL;
566 			}
567 		}
568 
569 		if (update) {
570 			/*
571 			 * Install the new values.
572 			 */
573 			vd->vdev_devid = vd_devid;
574 			dvd->vd_minor = minorname;
575 			dvd->vd_devid = devid;
576 
577 		} else {
578 			if (devid != NULL) {
579 				ddi_devid_free(devid);
580 			}
581 			if (minorname != NULL) {
582 				kmem_free(minorname, strlen(minorname) + 1);
583 			}
584 		}
585 	}
586 
587 	/*
588 	 * Once a device is opened, verify that the physical device path (if
589 	 * available) is up to date.
590 	 */
591 	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
592 	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
593 		char *physpath, *minorname;
594 
595 		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
596 		minorname = NULL;
597 		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
598 		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
599 		    (vd->vdev_physpath == NULL ||
600 		    strcmp(vd->vdev_physpath, physpath) != 0)) {
601 			if (vd->vdev_physpath)
602 				spa_strfree(vd->vdev_physpath);
603 			(void) strlcat(physpath, ":", MAXPATHLEN);
604 			(void) strlcat(physpath, minorname, MAXPATHLEN);
605 			vd->vdev_physpath = spa_strdup(physpath);
606 		}
607 		if (minorname)
608 			kmem_free(minorname, strlen(minorname) + 1);
609 		kmem_free(physpath, MAXPATHLEN);
610 	}
611 
612 	/*
613 	 * Register callbacks for the LDI offline event.
614 	 */
615 	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
616 	    LDI_EV_SUCCESS) {
617 		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
618 		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
619 		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
620 		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
621 	}
622 
623 	/*
624 	 * Register callbacks for the LDI degrade event.
625 	 */
626 	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
627 	    LDI_EV_SUCCESS) {
628 		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
629 		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
630 		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
631 		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
632 	}
633 
634 skip_open:
635 	/*
636 	 * Determine the actual size of the device.
637 	 */
638 	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
639 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
640 		vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
641 		return (SET_ERROR(EINVAL));
642 	}
643 
644 	*max_psize = *psize;
645 
646 	/*
647 	 * Determine the device's minimum transfer size.
648 	 * If the ioctl isn't supported, assume DEV_BSIZE.
649 	 */
650 	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
651 	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
652 		capacity = dkmext->dki_capacity - 1;
653 		blksz = dkmext->dki_lbsize;
654 		pbsize = dkmext->dki_pbsize;
655 	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
656 	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
657 		VDEV_DEBUG(
658 		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
659 		    vd->vdev_path);
660 		capacity = dkm->dki_capacity - 1;
661 		blksz = dkm->dki_lbsize;
662 		pbsize = blksz;
663 	} else {
664 		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
665 		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
666 		    vd->vdev_path, error);
667 		pbsize = DEV_BSIZE;
668 	}
669 
670 	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
671 
672 	if (vd->vdev_wholedisk == 1) {
673 		int wce = 1;
674 
675 		if (error == 0) {
676 			/*
677 			 * If we have the capability to expand, we'd have
678 			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
679 			 * Adjust max_psize upward accordingly since we know
680 			 * we own the whole disk now.
681 			 */
682 			*max_psize = capacity * blksz;
683 		}
684 
685 		/*
686 		 * Since we own the whole disk, try to enable disk write
687 		 * caching.  We ignore errors because it's OK if we can't do it.
688 		 */
689 		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
690 		    FKIOCTL, kcred, NULL);
691 	}
692 
693 	/*
694 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
695 	 * try again.
696 	 */
697 	vd->vdev_nowritecache = B_FALSE;
698 
699 	if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL,
700 	    kcred, NULL) == 0 && can_free == 1) {
701 		vd->vdev_has_trim = B_TRUE;
702 	} else {
703 		vd->vdev_has_trim = B_FALSE;
704 	}
705 
706 	if (zfs_no_trim == 1)
707 		vd->vdev_has_trim = B_FALSE;
708 
709 	/* Currently only supported for ZoL. */
710 	vd->vdev_has_securetrim = B_FALSE;
711 
712 	/* Inform the ZIO pipeline that we are non-rotational */
713 	vd->vdev_nonrot = B_FALSE;
714 	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
715 	    "device-solid-state")) {
716 		if (ldi_prop_get_int(dvd->vd_lh,
717 		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
718 		    "device-solid-state", B_FALSE) != 0)
719 			vd->vdev_nonrot = B_TRUE;
720 	}
721 
722 	return (0);
723 }
724 
725 static void
726 vdev_disk_close(vdev_t *vd)
727 {
728 	vdev_disk_t *dvd = vd->vdev_tsd;
729 
730 	if (vd->vdev_reopening || dvd == NULL)
731 		return;
732 
733 	if (dvd->vd_minor != NULL) {
734 		ddi_devid_str_free(dvd->vd_minor);
735 		dvd->vd_minor = NULL;
736 	}
737 
738 	if (dvd->vd_devid != NULL) {
739 		ddi_devid_free(dvd->vd_devid);
740 		dvd->vd_devid = NULL;
741 	}
742 
743 	if (dvd->vd_lh != NULL) {
744 		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
745 		dvd->vd_lh = NULL;
746 	}
747 
748 	vd->vdev_delayed_close = B_FALSE;
749 	vdev_disk_free(vd);
750 }
751 
752 static int
753 vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
754     size_t size, uint64_t offset, int flags)
755 {
756 	buf_t *bp;
757 	int error = 0;
758 
759 	if (vd_lh == NULL)
760 		return (SET_ERROR(EINVAL));
761 
762 	ASSERT(flags & B_READ || flags & B_WRITE);
763 
764 	bp = getrbuf(KM_SLEEP);
765 	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
766 	bp->b_bcount = size;
767 	bp->b_un.b_addr = (void *)data;
768 	bp->b_lblkno = lbtodb(offset);
769 	bp->b_bufsize = size;
770 
771 	error = ldi_strategy(vd_lh, bp);
772 	ASSERT(error == 0);
773 	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
774 		error = SET_ERROR(EIO);
775 	freerbuf(bp);
776 
777 	return (error);
778 }
779 
780 static int
781 vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size,
782     uint64_t offset, uint64_t origoffset __unused, boolean_t doread,
783     boolean_t isdump)
784 {
785 	vdev_disk_t *dvd = vd->vdev_tsd;
786 	int flags = doread ? B_READ : B_WRITE;
787 
788 	/*
789 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
790 	 * Nothing to be done here but return failure.
791 	 */
792 	if (dvd == NULL || dvd->vd_ldi_offline) {
793 		return (SET_ERROR(ENXIO));
794 	}
795 
796 	ASSERT(vd->vdev_ops == &vdev_disk_ops);
797 
798 	offset += VDEV_LABEL_START_SIZE;
799 
800 	/*
801 	 * If in the context of an active crash dump, use the ldi_dump(9F)
802 	 * call instead of ldi_strategy(9F) as usual.
803 	 */
804 	if (isdump) {
805 		ASSERT3P(dvd, !=, NULL);
806 		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
807 		    lbtodb(size)));
808 	}
809 
810 	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
811 }
812 
813 static int
814 vdev_disk_io_intr(buf_t *bp)
815 {
816 	vdev_buf_t *vb = (vdev_buf_t *)bp;
817 	zio_t *zio = vb->vb_io;
818 
819 	/*
820 	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
821 	 * Rather than teach the rest of the stack about other error
822 	 * possibilities (EFAULT, etc), we normalize the error value here.
823 	 */
824 	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
825 
826 	if (zio->io_error == 0 && bp->b_resid != 0)
827 		zio->io_error = SET_ERROR(EIO);
828 
829 	if (zio->io_type == ZIO_TYPE_READ) {
830 		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
831 	} else {
832 		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
833 	}
834 
835 	kmem_free(vb, sizeof (vdev_buf_t));
836 
837 	zio_delay_interrupt(zio);
838 	return (0);
839 }
840 
841 static void
842 vdev_disk_ioctl_free(zio_t *zio)
843 {
844 	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
845 }
846 
847 static const zio_vsd_ops_t vdev_disk_vsd_ops = {
848 	vdev_disk_ioctl_free,
849 	zio_vsd_default_cksum_report
850 };
851 
852 static void
853 vdev_disk_ioctl_done(void *zio_arg, int error)
854 {
855 	zio_t *zio = zio_arg;
856 
857 	zio->io_error = error;
858 
859 	zio_interrupt(zio);
860 }
861 
862 static void
863 vdev_disk_io_start(zio_t *zio)
864 {
865 	vdev_t *vd = zio->io_vd;
866 	vdev_disk_t *dvd = vd->vdev_tsd;
867 	unsigned long trim_flags = 0;
868 	vdev_buf_t *vb;
869 	struct dk_callback *dkc;
870 	buf_t *bp;
871 	int error;
872 
873 	/*
874 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
875 	 * Nothing to be done here but return failure.
876 	 */
877 	if (dvd == NULL || dvd->vd_ldi_offline) {
878 		zio->io_error = ENXIO;
879 		zio_interrupt(zio);
880 		return;
881 	}
882 
883 	switch (zio->io_type) {
884 	case ZIO_TYPE_IOCTL:
885 		/* XXPOLICY */
886 		if (!vdev_readable(vd)) {
887 			zio->io_error = SET_ERROR(ENXIO);
888 			zio_interrupt(zio);
889 			return;
890 		}
891 
892 		switch (zio->io_cmd) {
893 
894 		case DKIOCFLUSHWRITECACHE:
895 
896 			if (zfs_nocacheflush)
897 				break;
898 
899 			if (vd->vdev_nowritecache) {
900 				zio->io_error = SET_ERROR(ENOTSUP);
901 				break;
902 			}
903 
904 			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
905 			zio->io_vsd_ops = &vdev_disk_vsd_ops;
906 
907 			dkc->dkc_callback = vdev_disk_ioctl_done;
908 			dkc->dkc_flag = FLUSH_VOLATILE;
909 			dkc->dkc_cookie = zio;
910 
911 			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
912 			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);
913 
914 			if (error == 0) {
915 				/*
916 				 * The ioctl will be done asychronously,
917 				 * and will call vdev_disk_ioctl_done()
918 				 * upon completion.
919 				 */
920 				return;
921 			}
922 
923 			zio->io_error = error;
924 
925 			break;
926 
927 		default:
928 			zio->io_error = SET_ERROR(ENOTSUP);
929 		}
930 
931 		zio_execute(zio);
932 		return;
933 
934 	case ZIO_TYPE_TRIM:
935 		if (zfs_no_trim == 1 || !vd->vdev_has_trim) {
936 			zio->io_error = SET_ERROR(ENOTSUP);
937 			zio_execute(zio);
938 			return;
939 		}
940 		/* Currently only supported on ZoL. */
941 		ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE);
942 
943 		/* dkioc_free_list_t is already declared to hold one entry */
944 		dkioc_free_list_t dfl;
945 		dfl.dfl_flags = 0;
946 		dfl.dfl_num_exts = 1;
947 		dfl.dfl_offset = 0;
948 		dfl.dfl_exts[0].dfle_start = zio->io_offset;
949 		dfl.dfl_exts[0].dfle_length = zio->io_size;
950 
951 		zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE,
952 		    (uintptr_t)&dfl, FKIOCTL, kcred, NULL);
953 
954 		if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) {
955 			/*
956 			 * The device must have changed and now TRIM is
957 			 * no longer supported.
958 			 */
959 			vd->vdev_has_trim = B_FALSE;
960 		}
961 
962 		zio_interrupt(zio);
963 		return;
964 	}
965 
966 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
967 	zio->io_target_timestamp = zio_handle_io_delay(zio);
968 
969 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
970 
971 	vb->vb_io = zio;
972 	bp = &vb->vb_buf;
973 
974 	bioinit(bp);
975 	bp->b_flags = B_BUSY | B_NOCACHE |
976 	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
977 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
978 		bp->b_flags |= B_FAILFAST;
979 	bp->b_bcount = zio->io_size;
980 
981 	if (zio->io_type == ZIO_TYPE_READ) {
982 		bp->b_un.b_addr =
983 		    abd_borrow_buf(zio->io_abd, zio->io_size);
984 	} else {
985 		bp->b_un.b_addr =
986 		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
987 	}
988 
989 	bp->b_lblkno = lbtodb(zio->io_offset);
990 	bp->b_bufsize = zio->io_size;
991 	bp->b_iodone = vdev_disk_io_intr;
992 
993 	/*
994 	 * In general we would expect ldi_strategy() to return non-zero only
995 	 * because of programming errors, but we've also seen this fail shortly
996 	 * after a disk dies.
997 	 */
998 	if (ldi_strategy(dvd->vd_lh, bp) != 0) {
999 		zio->io_error = ENXIO;
1000 		zio_interrupt(zio);
1001 	}
1002 }
1003 
1004 static void
1005 vdev_disk_io_done(zio_t *zio)
1006 {
1007 	vdev_t *vd = zio->io_vd;
1008 
1009 	/*
1010 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
1011 	 * the device has been removed.  If this is the case, then we trigger an
1012 	 * asynchronous removal of the device. Otherwise, probe the device and
1013 	 * make sure it's still accessible.
1014 	 */
1015 	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
1016 		vdev_disk_t *dvd = vd->vdev_tsd;
1017 		int state = DKIO_NONE;
1018 
1019 		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
1020 		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
1021 			/*
1022 			 * We post the resource as soon as possible, instead of
1023 			 * when the async removal actually happens, because the
1024 			 * DE is using this information to discard previous I/O
1025 			 * errors.
1026 			 */
1027 			zfs_post_remove(zio->io_spa, vd);
1028 			vd->vdev_remove_wanted = B_TRUE;
1029 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
1030 		} else if (!vd->vdev_delayed_close) {
1031 			vd->vdev_delayed_close = B_TRUE;
1032 		}
1033 	}
1034 }
1035 
1036 vdev_ops_t vdev_disk_ops = {
1037 	.vdev_op_open = vdev_disk_open,
1038 	.vdev_op_close = vdev_disk_close,
1039 	.vdev_op_asize = vdev_default_asize,
1040 	.vdev_op_io_start = vdev_disk_io_start,
1041 	.vdev_op_io_done = vdev_disk_io_done,
1042 	.vdev_op_state_change = NULL,
1043 	.vdev_op_need_resilver = NULL,
1044 	.vdev_op_hold = vdev_disk_hold,
1045 	.vdev_op_rele = vdev_disk_rele,
1046 	.vdev_op_remap = NULL,
1047 	.vdev_op_xlate = vdev_default_xlate,
1048 	.vdev_op_dumpio = vdev_disk_dumpio,
1049 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
1050 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
1051 };
1052 
1053 /*
1054  * Given the root disk device devid or pathname, read the label from
1055  * the device, and construct a configuration nvlist.
1056  */
1057 int
1058 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
1059 {
1060 	ldi_handle_t vd_lh;
1061 	vdev_label_t *label;
1062 	uint64_t s, size;
1063 	int l;
1064 	ddi_devid_t tmpdevid;
1065 	int error = -1;
1066 	char *minor_name;
1067 
1068 	/*
1069 	 * Read the device label and build the nvlist.
1070 	 */
1071 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
1072 	    &minor_name) == 0) {
1073 		error = ldi_open_by_devid(tmpdevid, minor_name,
1074 		    FREAD, kcred, &vd_lh, zfs_li);
1075 		ddi_devid_free(tmpdevid);
1076 		ddi_devid_str_free(minor_name);
1077 	}
1078 
1079 	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
1080 	    zfs_li)))
1081 		return (error);
1082 
1083 	if (ldi_get_size(vd_lh, &s)) {
1084 		(void) ldi_close(vd_lh, FREAD, kcred);
1085 		return (SET_ERROR(EIO));
1086 	}
1087 
1088 	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
1089 	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
1090 
1091 	*config = NULL;
1092 	for (l = 0; l < VDEV_LABELS; l++) {
1093 		uint64_t offset, state, txg = 0;
1094 
1095 		/* read vdev label */
1096 		offset = vdev_label_offset(size, l, 0);
1097 		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
1098 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
1099 			continue;
1100 
1101 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
1102 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
1103 			*config = NULL;
1104 			continue;
1105 		}
1106 
1107 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
1108 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
1109 			nvlist_free(*config);
1110 			*config = NULL;
1111 			continue;
1112 		}
1113 
1114 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
1115 		    &txg) != 0 || txg == 0) {
1116 			nvlist_free(*config);
1117 			*config = NULL;
1118 			continue;
1119 		}
1120 
1121 		break;
1122 	}
1123 
1124 	kmem_free(label, sizeof (vdev_label_t));
1125 	(void) ldi_close(vd_lh, FREAD, kcred);
1126 	if (*config == NULL)
1127 		error = SET_ERROR(EIDRM);
1128 
1129 	return (error);
1130 }
1131