1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_file.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/zio.h> 33 #include <sys/fs/zfs.h> 34 35 /* 36 * Virtual device vector for files. 37 */ 38 39 static int 40 vdev_file_open_common(vdev_t *vd) 41 { 42 vdev_file_t *vf; 43 vnode_t *vp; 44 int error; 45 46 /* 47 * We must have a pathname, and it must be absolute. 48 */ 49 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 50 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 51 return (EINVAL); 52 } 53 54 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 55 56 /* 57 * We always open the files from the root of the global zone, even if 58 * we're in a local zone. If the user has gotten to this point, the 59 * administrator has already decided that the pool should be available 60 * to local zone users, so the underlying devices should be as well. 61 */ 62 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 63 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 64 spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 65 66 if (error) { 67 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 68 return (error); 69 } 70 71 vf->vf_vnode = vp; 72 73 #ifdef _KERNEL 74 /* 75 * Make sure it's a regular file. 76 */ 77 if (vp->v_type != VREG) { 78 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 79 return (ENODEV); 80 } 81 #endif 82 83 return (0); 84 } 85 86 static int 87 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) 88 { 89 vdev_file_t *vf; 90 vattr_t vattr; 91 int error; 92 93 if ((error = vdev_file_open_common(vd)) != 0) 94 return (error); 95 96 vf = vd->vdev_tsd; 97 98 /* 99 * Determine the physical size of the file. 100 */ 101 vattr.va_mask = AT_SIZE; 102 error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); 103 if (error) { 104 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 105 return (error); 106 } 107 108 *psize = vattr.va_size; 109 *ashift = SPA_MINBLOCKSHIFT; 110 111 return (0); 112 } 113 114 static void 115 vdev_file_close(vdev_t *vd) 116 { 117 vdev_file_t *vf = vd->vdev_tsd; 118 119 if (vf == NULL) 120 return; 121 122 if (vf->vf_vnode != NULL) { 123 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); 124 (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); 125 VN_RELE(vf->vf_vnode); 126 } 127 128 kmem_free(vf, sizeof (vdev_file_t)); 129 vd->vdev_tsd = NULL; 130 } 131 132 static int 133 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, 134 enum uio_rw rw) 135 { 136 vdev_file_t *vf = vd->vdev_tsd; 137 ssize_t resid; 138 int error = 0; 139 140 if (vd == NULL || vf == NULL || vf->vf_vnode == NULL) 141 return (EINVAL); 142 143 ASSERT(rw == UIO_READ || rw == UIO_WRITE); 144 145 error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE, 146 0, RLIM64_INFINITY, kcred, &resid); 147 if (error || resid != 0) 148 return (EIO); 149 return (0); 150 } 151 152 /* 153 * Determine if the underlying device is accessible by reading and writing 154 * to a known location. We must be able to do this during syncing context 155 * and thus we cannot set the vdev state directly. 156 */ 157 static int 158 vdev_file_probe(vdev_t *vd) 159 { 160 vdev_t *nvd; 161 char *vl_boot; 162 uint64_t offset; 163 int l, error = 0, retries = 0; 164 165 if (vd == NULL) 166 return (EINVAL); 167 168 /* Hijack the current vdev */ 169 nvd = vd; 170 171 /* 172 * Pick a random label to rewrite. 173 */ 174 l = spa_get_random(VDEV_LABELS); 175 ASSERT(l < VDEV_LABELS); 176 177 offset = vdev_label_offset(vd->vdev_psize, l, 178 offsetof(vdev_label_t, vl_boot_header)); 179 180 vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP); 181 182 while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 183 offset, UIO_READ)) != 0 && retries == 0) { 184 185 /* 186 * If we failed with the vdev that was passed in then 187 * try allocating a new one and try again. 188 */ 189 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 190 if (vd->vdev_path) 191 nvd->vdev_path = spa_strdup(vd->vdev_path); 192 retries++; 193 194 error = vdev_file_open_common(nvd); 195 if (error) 196 break; 197 } 198 199 if ((spa_mode & FWRITE) && !error) { 200 error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 201 offset, UIO_WRITE); 202 } 203 204 if (retries) { 205 vdev_file_close(nvd); 206 if (nvd->vdev_path) 207 spa_strfree(nvd->vdev_path); 208 kmem_free(nvd, sizeof (vdev_t)); 209 } 210 kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE); 211 212 if (!error) 213 vd->vdev_is_failing = B_FALSE; 214 215 return (error); 216 } 217 218 static void 219 vdev_file_io_start(zio_t *zio) 220 { 221 vdev_t *vd = zio->io_vd; 222 vdev_file_t *vf = vd->vdev_tsd; 223 ssize_t resid; 224 int error; 225 226 if (zio->io_type == ZIO_TYPE_IOCTL) { 227 zio_vdev_io_bypass(zio); 228 229 /* XXPOLICY */ 230 if (!vdev_readable(vd)) { 231 zio->io_error = ENXIO; 232 zio_next_stage_async(zio); 233 return; 234 } 235 236 switch (zio->io_cmd) { 237 case DKIOCFLUSHWRITECACHE: 238 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 239 kcred, NULL); 240 dprintf("fsync(%s) = %d\n", vdev_description(vd), 241 zio->io_error); 242 break; 243 default: 244 zio->io_error = ENOTSUP; 245 } 246 247 zio_next_stage_async(zio); 248 return; 249 } 250 251 /* 252 * In the kernel, don't bother double-caching, but in userland, 253 * we want to test the vdev_cache code. 254 */ 255 #ifndef _KERNEL 256 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 257 return; 258 #endif 259 260 if ((zio = vdev_queue_io(zio)) == NULL) 261 return; 262 263 /* XXPOLICY */ 264 if (zio->io_type == ZIO_TYPE_WRITE) 265 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 266 else 267 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 268 error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; 269 if (error) { 270 zio->io_error = error; 271 zio_next_stage_async(zio); 272 return; 273 } 274 275 zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 276 UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, 277 zio->io_size, zio->io_offset, UIO_SYSSPACE, 278 0, RLIM64_INFINITY, kcred, &resid); 279 280 if (resid != 0 && zio->io_error == 0) 281 zio->io_error = ENOSPC; 282 283 zio_next_stage_async(zio); 284 } 285 286 static void 287 vdev_file_io_done(zio_t *zio) 288 { 289 290 if (zio_injection_enabled && zio->io_error == 0) 291 zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); 292 293 /* 294 * If an error has been encountered then attempt to probe the device 295 * to determine if it's still accessible. 296 */ 297 if (zio->io_error == EIO) { 298 vdev_t *vd = zio->io_vd; 299 300 if (vdev_probe(vd) != 0) 301 vd->vdev_is_failing = B_TRUE; 302 } 303 304 vdev_queue_io_done(zio); 305 306 #ifndef _KERNEL 307 if (zio->io_type == ZIO_TYPE_WRITE) 308 vdev_cache_write(zio); 309 #endif 310 311 zio_next_stage(zio); 312 } 313 314 vdev_ops_t vdev_file_ops = { 315 vdev_file_open, 316 vdev_file_close, 317 vdev_file_probe, 318 vdev_default_asize, 319 vdev_file_io_start, 320 vdev_file_io_done, 321 NULL, 322 VDEV_TYPE_FILE, /* name of this vdev type */ 323 B_TRUE /* leaf vdev */ 324 }; 325 326 /* 327 * From userland we access disks just like files. 328 */ 329 #ifndef _KERNEL 330 331 vdev_ops_t vdev_disk_ops = { 332 vdev_file_open, 333 vdev_file_close, 334 vdev_file_probe, 335 vdev_default_asize, 336 vdev_file_io_start, 337 vdev_file_io_done, 338 NULL, 339 VDEV_TYPE_DISK, /* name of this vdev type */ 340 B_TRUE /* leaf vdev */ 341 }; 342 343 #endif 344