1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 #include <sys/fs/zfs.h> 33 34 /* 35 * Virtual device vector for mirroring. 36 */ 37 38 typedef struct mirror_child { 39 vdev_t *mc_vd; 40 uint64_t mc_offset; 41 int mc_error; 42 short mc_tried; 43 short mc_skipped; 44 } mirror_child_t; 45 46 typedef struct mirror_map { 47 int mm_children; 48 int mm_replacing; 49 int mm_preferred; 50 int mm_root; 51 mirror_child_t mm_child[1]; 52 } mirror_map_t; 53 54 static mirror_map_t * 55 vdev_mirror_map_alloc(zio_t *zio) 56 { 57 mirror_map_t *mm = NULL; 58 mirror_child_t *mc; 59 vdev_t *vd = zio->io_vd; 60 int c, d; 61 62 if (vd == NULL) { 63 dva_t *dva = zio->io_bp->blk_dva; 64 spa_t *spa = zio->io_spa; 65 66 c = BP_GET_NDVAS(zio->io_bp); 67 68 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); 69 mm->mm_children = c; 70 mm->mm_replacing = B_FALSE; 71 mm->mm_preferred = spa_get_random(c); 72 mm->mm_root = B_TRUE; 73 74 /* 75 * Check the other, lower-index DVAs to see if they're on 76 * the same vdev as the child we picked. If they are, use 77 * them since they are likely to have been allocated from 78 * the primary metaslab in use at the time, and hence are 79 * more likely to have locality with single-copy data. 80 */ 81 for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { 82 if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) 83 mm->mm_preferred = d; 84 } 85 86 for (c = 0; c < mm->mm_children; c++) { 87 mc = &mm->mm_child[c]; 88 mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); 89 mc->mc_offset = DVA_GET_OFFSET(&dva[c]); 90 } 91 } else { 92 c = vd->vdev_children; 93 94 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); 95 mm->mm_children = c; 96 mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops); 97 mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c); 98 mm->mm_root = B_FALSE; 99 100 for (c = 0; c < mm->mm_children; c++) { 101 mc = &mm->mm_child[c]; 102 mc->mc_vd = vd->vdev_child[c]; 103 mc->mc_offset = zio->io_offset; 104 } 105 } 106 107 zio->io_vsd = mm; 108 return (mm); 109 } 110 111 static void 112 vdev_mirror_map_free(zio_t *zio) 113 { 114 mirror_map_t *mm = zio->io_vsd; 115 116 kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); 117 zio->io_vsd = NULL; 118 } 119 120 static int 121 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 122 { 123 vdev_t *cvd; 124 uint64_t c; 125 int numerrors = 0; 126 int ret, lasterror = 0; 127 128 if (vd->vdev_children == 0) { 129 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 130 return (EINVAL); 131 } 132 133 for (c = 0; c < vd->vdev_children; c++) { 134 cvd = vd->vdev_child[c]; 135 136 if ((ret = vdev_open(cvd)) != 0) { 137 lasterror = ret; 138 numerrors++; 139 continue; 140 } 141 142 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 143 *ashift = MAX(*ashift, cvd->vdev_ashift); 144 } 145 146 if (numerrors == vd->vdev_children) { 147 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 148 return (lasterror); 149 } 150 151 return (0); 152 } 153 154 static void 155 vdev_mirror_close(vdev_t *vd) 156 { 157 uint64_t c; 158 159 for (c = 0; c < vd->vdev_children; c++) 160 vdev_close(vd->vdev_child[c]); 161 } 162 163 static void 164 vdev_mirror_child_done(zio_t *zio) 165 { 166 mirror_child_t *mc = zio->io_private; 167 168 mc->mc_error = zio->io_error; 169 mc->mc_tried = 1; 170 mc->mc_skipped = 0; 171 } 172 173 static void 174 vdev_mirror_scrub_done(zio_t *zio) 175 { 176 mirror_child_t *mc = zio->io_private; 177 178 if (zio->io_error == 0) { 179 zio_t *pio = zio->io_parent; 180 mutex_enter(&pio->io_lock); 181 ASSERT3U(zio->io_size, >=, pio->io_size); 182 bcopy(zio->io_data, pio->io_data, pio->io_size); 183 mutex_exit(&pio->io_lock); 184 } 185 186 zio_buf_free(zio->io_data, zio->io_size); 187 188 mc->mc_error = zio->io_error; 189 mc->mc_tried = 1; 190 mc->mc_skipped = 0; 191 } 192 193 static void 194 vdev_mirror_repair_done(zio_t *zio) 195 { 196 ASSERT(zio->io_private == zio->io_parent); 197 vdev_mirror_map_free(zio->io_private); 198 } 199 200 /* 201 * Try to find a child whose DTL doesn't contain the block we want to read. 202 * If we can't, try the read on any vdev we haven't already tried. 203 */ 204 static int 205 vdev_mirror_child_select(zio_t *zio) 206 { 207 mirror_map_t *mm = zio->io_vsd; 208 mirror_child_t *mc; 209 uint64_t txg = zio->io_txg; 210 int i, c; 211 212 ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); 213 214 /* 215 * Try to find a child whose DTL doesn't contain the block to read. 216 * If a child is known to be completely inaccessible (indicated by 217 * vdev_is_dead() returning B_TRUE), don't even try. 218 */ 219 for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { 220 if (c >= mm->mm_children) 221 c = 0; 222 mc = &mm->mm_child[c]; 223 if (mc->mc_tried || mc->mc_skipped) 224 continue; 225 if (vdev_is_dead(mc->mc_vd)) { 226 mc->mc_error = ENXIO; 227 mc->mc_tried = 1; /* don't even try */ 228 mc->mc_skipped = 1; 229 continue; 230 } 231 if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) 232 return (c); 233 mc->mc_error = ESTALE; 234 mc->mc_skipped = 1; 235 } 236 237 /* 238 * Every device is either missing or has this txg in its DTL. 239 * Look for any child we haven't already tried before giving up. 240 */ 241 for (c = 0; c < mm->mm_children; c++) 242 if (!mm->mm_child[c].mc_tried) 243 return (c); 244 245 /* 246 * Every child failed. There's no place left to look. 247 */ 248 return (-1); 249 } 250 251 static void 252 vdev_mirror_io_start(zio_t *zio) 253 { 254 mirror_map_t *mm; 255 mirror_child_t *mc; 256 int c, children; 257 258 mm = vdev_mirror_map_alloc(zio); 259 260 if (zio->io_type == ZIO_TYPE_READ) { 261 if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { 262 /* 263 * For scrubbing reads we need to allocate a read 264 * buffer for each child and issue reads to all 265 * children. If any child succeeds, it will copy its 266 * data into zio->io_data in vdev_mirror_scrub_done. 267 */ 268 for (c = 0; c < mm->mm_children; c++) { 269 mc = &mm->mm_child[c]; 270 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 271 mc->mc_vd, mc->mc_offset, 272 zio_buf_alloc(zio->io_size), zio->io_size, 273 zio->io_type, zio->io_priority, 274 ZIO_FLAG_CANFAIL, 275 vdev_mirror_scrub_done, mc)); 276 } 277 zio_wait_children_done(zio); 278 return; 279 } 280 /* 281 * For normal reads just pick one child. 282 */ 283 c = vdev_mirror_child_select(zio); 284 children = (c >= 0); 285 } else { 286 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 287 288 /* 289 * If this is a resilvering I/O to a replacing vdev, 290 * only the last child should be written -- unless the 291 * first child happens to have a DTL entry here as well. 292 * All other writes go to all children. 293 */ 294 if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && 295 !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, 296 zio->io_txg, 1)) { 297 c = mm->mm_children - 1; 298 children = 1; 299 } else { 300 c = 0; 301 children = mm->mm_children; 302 } 303 } 304 305 while (children--) { 306 mc = &mm->mm_child[c]; 307 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 308 mc->mc_vd, mc->mc_offset, 309 zio->io_data, zio->io_size, zio->io_type, zio->io_priority, 310 ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); 311 c++; 312 } 313 314 zio_wait_children_done(zio); 315 } 316 317 static void 318 vdev_mirror_io_done(zio_t *zio) 319 { 320 mirror_map_t *mm = zio->io_vsd; 321 mirror_child_t *mc; 322 int c; 323 int good_copies = 0; 324 int unexpected_errors = 0; 325 326 zio->io_error = 0; 327 zio->io_numerrors = 0; 328 329 for (c = 0; c < mm->mm_children; c++) { 330 mc = &mm->mm_child[c]; 331 332 if (mc->mc_tried && mc->mc_error == 0) { 333 good_copies++; 334 continue; 335 } 336 337 /* 338 * We preserve any EIOs because those may be worth retrying; 339 * whereas ECKSUM and ENXIO are more likely to be persistent. 340 */ 341 if (mc->mc_error) { 342 if (zio->io_error != EIO) 343 zio->io_error = mc->mc_error; 344 if (!mc->mc_skipped) 345 unexpected_errors++; 346 zio->io_numerrors++; 347 } 348 } 349 350 if (zio->io_type == ZIO_TYPE_WRITE) { 351 /* 352 * XXX -- for now, treat partial writes as success. 353 * XXX -- For a replacing vdev, we need to make sure the 354 * new child succeeds. 355 */ 356 /* XXPOLICY */ 357 if (good_copies != 0) 358 zio->io_error = 0; 359 vdev_mirror_map_free(zio); 360 zio_next_stage(zio); 361 return; 362 } 363 364 ASSERT(zio->io_type == ZIO_TYPE_READ); 365 366 /* 367 * If we don't have a good copy yet, keep trying other children. 368 */ 369 /* XXPOLICY */ 370 if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { 371 ASSERT(c >= 0 && c < mm->mm_children); 372 mc = &mm->mm_child[c]; 373 dprintf("retrying i/o (err=%d) on child %s\n", 374 zio->io_error, vdev_description(mc->mc_vd)); 375 zio->io_error = 0; 376 zio_vdev_io_redone(zio); 377 zio_nowait(zio_vdev_child_io(zio, zio->io_bp, 378 mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, 379 ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, 380 vdev_mirror_child_done, mc)); 381 zio_wait_children_done(zio); 382 return; 383 } 384 385 /* XXPOLICY */ 386 if (good_copies) 387 zio->io_error = 0; 388 else 389 ASSERT(zio->io_error != 0); 390 391 if (good_copies && (spa_mode & FWRITE) && 392 (unexpected_errors || 393 (zio->io_flags & ZIO_FLAG_RESILVER) || 394 ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { 395 zio_t *rio; 396 397 /* 398 * Use the good data we have in hand to repair damaged children. 399 * 400 * We issue all repair I/Os as children of 'rio' to arrange 401 * that vdev_mirror_map_free(zio) will be invoked after all 402 * repairs complete, but before we advance to the next stage. 403 */ 404 rio = zio_null(zio, zio->io_spa, 405 vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL); 406 407 for (c = 0; c < mm->mm_children; c++) { 408 /* 409 * Don't rewrite known good children. 410 * Not only is it unnecessary, it could 411 * actually be harmful: if the system lost 412 * power while rewriting the only good copy, 413 * there would be no good copies left! 414 */ 415 mc = &mm->mm_child[c]; 416 417 if (mc->mc_error == 0) { 418 if (mc->mc_tried) 419 continue; 420 if (!(zio->io_flags & ZIO_FLAG_SCRUB) && 421 !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, 422 zio->io_txg, 1)) 423 continue; 424 mc->mc_error = ESTALE; 425 } 426 427 dprintf("resilvered %s @ 0x%llx error %d\n", 428 vdev_description(mc->mc_vd), mc->mc_offset, 429 mc->mc_error); 430 431 zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd, 432 mc->mc_offset, zio->io_data, zio->io_size, 433 ZIO_TYPE_WRITE, zio->io_priority, 434 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | 435 ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); 436 } 437 438 zio_nowait(rio); 439 zio_wait_children_done(zio); 440 return; 441 } 442 443 vdev_mirror_map_free(zio); 444 zio_next_stage(zio); 445 } 446 447 static void 448 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) 449 { 450 if (faulted == vd->vdev_children) 451 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 452 VDEV_AUX_NO_REPLICAS); 453 else if (degraded + faulted != 0) 454 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 455 else 456 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 457 } 458 459 vdev_ops_t vdev_mirror_ops = { 460 vdev_mirror_open, 461 vdev_mirror_close, 462 vdev_default_asize, 463 vdev_mirror_io_start, 464 vdev_mirror_io_done, 465 vdev_mirror_state_change, 466 VDEV_TYPE_MIRROR, /* name of this vdev type */ 467 B_FALSE /* not a leaf vdev */ 468 }; 469 470 vdev_ops_t vdev_replacing_ops = { 471 vdev_mirror_open, 472 vdev_mirror_close, 473 vdev_default_asize, 474 vdev_mirror_io_start, 475 vdev_mirror_io_done, 476 vdev_mirror_state_change, 477 VDEV_TYPE_REPLACING, /* name of this vdev type */ 478 B_FALSE /* not a leaf vdev */ 479 }; 480