1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_file.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/zio.h> 33 #include <sys/fs/zfs.h> 34 35 /* 36 * Virtual device vector for files. 37 */ 38 39 static int 40 vdev_file_open_common(vdev_t *vd) 41 { 42 vdev_file_t *vf; 43 vnode_t *vp; 44 int error; 45 46 /* 47 * We must have a pathname, and it must be absolute. 48 */ 49 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 50 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 51 return (EINVAL); 52 } 53 54 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 55 56 /* 57 * We always open the files from the root of the global zone, even if 58 * we're in a local zone. If the user has gotten to this point, the 59 * administrator has already decided that the pool should be available 60 * to local zone users, so the underlying devices should be as well. 61 */ 62 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 63 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 64 spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 65 66 if (error) { 67 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 68 return (error); 69 } 70 71 vf->vf_vnode = vp; 72 73 #ifdef _KERNEL 74 /* 75 * Make sure it's a regular file. 76 */ 77 if (vp->v_type != VREG) { 78 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 79 return (ENODEV); 80 } 81 #endif 82 83 return (0); 84 } 85 86 static int 87 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) 88 { 89 vdev_file_t *vf; 90 vattr_t vattr; 91 int error; 92 93 if ((error = vdev_file_open_common(vd)) != 0) 94 return (error); 95 96 vf = vd->vdev_tsd; 97 98 /* 99 * Determine the physical size of the file. 100 */ 101 vattr.va_mask = AT_SIZE; 102 error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); 103 if (error) { 104 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 105 return (error); 106 } 107 108 *psize = vattr.va_size; 109 *ashift = SPA_MINBLOCKSHIFT; 110 111 return (0); 112 } 113 114 static void 115 vdev_file_close(vdev_t *vd) 116 { 117 vdev_file_t *vf = vd->vdev_tsd; 118 119 if (vf == NULL) 120 return; 121 122 if (vf->vf_vnode != NULL) { 123 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); 124 (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); 125 VN_RELE(vf->vf_vnode); 126 } 127 128 kmem_free(vf, sizeof (vdev_file_t)); 129 vd->vdev_tsd = NULL; 130 } 131 132 static int 133 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, 134 enum uio_rw rw) 135 { 136 vdev_file_t *vf = vd ? vd->vdev_tsd : NULL; 137 ssize_t resid; 138 int error = 0; 139 140 if (vd == NULL || vf == NULL || vf->vf_vnode == NULL) 141 return (EINVAL); 142 143 ASSERT(rw == UIO_READ || rw == UIO_WRITE); 144 145 error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE, 146 0, RLIM64_INFINITY, kcred, &resid); 147 if (error || resid != 0) 148 return (EIO); 149 return (0); 150 } 151 152 /* 153 * Determine if the underlying device is accessible by reading and writing 154 * to a known location. We must be able to do this during syncing context 155 * and thus we cannot set the vdev state directly. 156 */ 157 static int 158 vdev_file_probe(vdev_t *vd) 159 { 160 vdev_t *nvd; 161 char *vl_boot; 162 uint64_t offset; 163 int l, error = 0, retries = 0; 164 165 if (vd == NULL) 166 return (EINVAL); 167 168 /* Hijack the current vdev */ 169 nvd = vd; 170 171 /* 172 * Pick a random label to rewrite. 173 */ 174 l = spa_get_random(VDEV_LABELS); 175 ASSERT(l < VDEV_LABELS); 176 177 offset = vdev_label_offset(vd->vdev_psize, l, 178 offsetof(vdev_label_t, vl_boot_header)); 179 180 vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP); 181 182 while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 183 offset, UIO_READ)) != 0 && retries == 0) { 184 185 /* 186 * If we failed with the vdev that was passed in then 187 * try allocating a new one and try again. 188 */ 189 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 190 if (vd->vdev_path) 191 nvd->vdev_path = spa_strdup(vd->vdev_path); 192 retries++; 193 194 error = vdev_file_open_common(nvd); 195 if (error) 196 break; 197 } 198 199 if ((spa_mode & FWRITE) && !error) { 200 error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 201 offset, UIO_WRITE); 202 } 203 204 if (retries) { 205 vdev_file_close(nvd); 206 if (nvd->vdev_path) 207 spa_strfree(nvd->vdev_path); 208 kmem_free(nvd, sizeof (vdev_t)); 209 } 210 kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE); 211 212 if (!error) 213 vd->vdev_is_failing = B_FALSE; 214 215 return (error); 216 } 217 218 static int 219 vdev_file_io_start(zio_t *zio) 220 { 221 vdev_t *vd = zio->io_vd; 222 vdev_file_t *vf = vd->vdev_tsd; 223 ssize_t resid; 224 int error; 225 226 if (zio->io_type == ZIO_TYPE_IOCTL) { 227 zio_vdev_io_bypass(zio); 228 229 /* XXPOLICY */ 230 if (!vdev_readable(vd)) { 231 zio->io_error = ENXIO; 232 return (ZIO_PIPELINE_CONTINUE); 233 } 234 235 switch (zio->io_cmd) { 236 case DKIOCFLUSHWRITECACHE: 237 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 238 kcred, NULL); 239 dprintf("fsync(%s) = %d\n", vdev_description(vd), 240 zio->io_error); 241 break; 242 default: 243 zio->io_error = ENOTSUP; 244 } 245 246 return (ZIO_PIPELINE_CONTINUE); 247 } 248 249 /* 250 * In the kernel, don't bother double-caching, but in userland, 251 * we want to test the vdev_cache code. 252 */ 253 #ifndef _KERNEL 254 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 255 return (ZIO_PIPELINE_STOP); 256 #endif 257 258 if ((zio = vdev_queue_io(zio)) == NULL) 259 return (ZIO_PIPELINE_STOP); 260 261 /* XXPOLICY */ 262 if (zio->io_type == ZIO_TYPE_WRITE) 263 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 264 else 265 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 266 error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; 267 if (error) { 268 zio->io_error = error; 269 zio_interrupt(zio); 270 return (ZIO_PIPELINE_STOP); 271 } 272 273 zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 274 UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, 275 zio->io_size, zio->io_offset, UIO_SYSSPACE, 276 0, RLIM64_INFINITY, kcred, &resid); 277 278 if (resid != 0 && zio->io_error == 0) 279 zio->io_error = ENOSPC; 280 281 zio_interrupt(zio); 282 283 return (ZIO_PIPELINE_STOP); 284 } 285 286 static int 287 vdev_file_io_done(zio_t *zio) 288 { 289 vdev_t *vd = zio->io_vd; 290 291 if (zio_injection_enabled && zio->io_error == 0) 292 zio->io_error = zio_handle_device_injection(vd, EIO); 293 294 /* 295 * If an error has been encountered then attempt to probe the device 296 * to determine if it's still accessible. 297 */ 298 if (zio->io_error == EIO && vdev_probe(vd) != 0) 299 vd->vdev_is_failing = B_TRUE; 300 301 vdev_queue_io_done(zio); 302 303 #ifndef _KERNEL 304 if (zio->io_type == ZIO_TYPE_WRITE) 305 vdev_cache_write(zio); 306 #endif 307 308 return (ZIO_PIPELINE_CONTINUE); 309 } 310 311 vdev_ops_t vdev_file_ops = { 312 vdev_file_open, 313 vdev_file_close, 314 vdev_file_probe, 315 vdev_default_asize, 316 vdev_file_io_start, 317 vdev_file_io_done, 318 NULL, 319 VDEV_TYPE_FILE, /* name of this vdev type */ 320 B_TRUE /* leaf vdev */ 321 }; 322 323 /* 324 * From userland we access disks just like files. 325 */ 326 #ifndef _KERNEL 327 328 vdev_ops_t vdev_disk_ops = { 329 vdev_file_open, 330 vdev_file_close, 331 vdev_file_probe, 332 vdev_default_asize, 333 vdev_file_io_start, 334 vdev_file_io_done, 335 NULL, 336 VDEV_TYPE_DISK, /* name of this vdev type */ 337 B_TRUE /* leaf vdev */ 338 }; 339 340 #endif 341