1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_file.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/zio.h> 33 #include <sys/fs/zfs.h> 34 #include <sys/fm/fs/zfs.h> 35 36 /* 37 * Virtual device vector for files. 38 */ 39 40 static int 41 vdev_file_open_common(vdev_t *vd) 42 { 43 vdev_file_t *vf; 44 vnode_t *vp; 45 int error; 46 47 /* 48 * We must have a pathname, and it must be absolute. 49 */ 50 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 51 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 52 return (EINVAL); 53 } 54 55 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 56 57 /* 58 * We always open the files from the root of the global zone, even if 59 * we're in a local zone. If the user has gotten to this point, the 60 * administrator has already decided that the pool should be available 61 * to local zone users, so the underlying devices should be as well. 62 */ 63 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 64 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 65 spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 66 67 if (error) { 68 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 69 return (error); 70 } 71 72 vf->vf_vnode = vp; 73 74 #ifdef _KERNEL 75 /* 76 * Make sure it's a regular file. 77 */ 78 if (vp->v_type != VREG) { 79 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 80 return (ENODEV); 81 } 82 #endif 83 84 return (0); 85 } 86 87 static int 88 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) 89 { 90 vdev_file_t *vf; 91 vattr_t vattr; 92 int error; 93 94 if ((error = vdev_file_open_common(vd)) != 0) 95 return (error); 96 97 vf = vd->vdev_tsd; 98 99 /* 100 * Determine the physical size of the file. 101 */ 102 vattr.va_mask = AT_SIZE; 103 error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); 104 if (error) { 105 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 106 return (error); 107 } 108 109 *psize = vattr.va_size; 110 *ashift = SPA_MINBLOCKSHIFT; 111 112 return (0); 113 } 114 115 static void 116 vdev_file_close(vdev_t *vd) 117 { 118 vdev_file_t *vf = vd->vdev_tsd; 119 120 if (vf == NULL) 121 return; 122 123 if (vf->vf_vnode != NULL) { 124 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); 125 (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); 126 VN_RELE(vf->vf_vnode); 127 } 128 129 kmem_free(vf, sizeof (vdev_file_t)); 130 vd->vdev_tsd = NULL; 131 } 132 133 static int 134 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, 135 enum uio_rw rw) 136 { 137 vdev_file_t *vf = vd ? vd->vdev_tsd : NULL; 138 ssize_t resid; 139 int error = 0; 140 141 if (vd == NULL || vf == NULL || vf->vf_vnode == NULL) 142 return (EINVAL); 143 144 ASSERT(rw == UIO_READ || rw == UIO_WRITE); 145 146 error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE, 147 0, RLIM64_INFINITY, kcred, &resid); 148 149 if (error || resid != 0) 150 return (EIO); 151 152 if (zio_injection_enabled) 153 error = zio_handle_device_injection(vd, EIO); 154 155 return (error); 156 } 157 158 /* 159 * Determine if the underlying device is accessible by reading and writing 160 * to a known location. We must be able to do this during syncing context 161 * and thus we cannot set the vdev state directly. 162 */ 163 static int 164 vdev_file_probe(vdev_t *vd) 165 { 166 vdev_t *nvd; 167 char *vl_boot; 168 uint64_t offset; 169 int l, error = 0, retries = 0; 170 171 if (vd == NULL) 172 return (EINVAL); 173 174 /* Hijack the current vdev */ 175 nvd = vd; 176 177 /* 178 * Pick a random label to rewrite. 179 */ 180 l = spa_get_random(VDEV_LABELS); 181 ASSERT(l < VDEV_LABELS); 182 183 offset = vdev_label_offset(vd->vdev_psize, l, 184 offsetof(vdev_label_t, vl_boot_header)); 185 186 vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP); 187 188 while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 189 offset, UIO_READ)) != 0 && retries == 0) { 190 191 /* 192 * If we failed with the vdev that was passed in then 193 * try allocating a new one and try again. 194 */ 195 nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 196 if (vd->vdev_path) 197 nvd->vdev_path = spa_strdup(vd->vdev_path); 198 nvd->vdev_guid = vd->vdev_guid; 199 retries++; 200 201 if (vdev_file_open_common(nvd) != 0) 202 break; 203 } 204 205 if ((spa_mode & FWRITE) && !error) { 206 error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE, 207 offset, UIO_WRITE); 208 } 209 210 if (retries) { 211 vdev_file_close(nvd); 212 if (nvd->vdev_path) 213 spa_strfree(nvd->vdev_path); 214 kmem_free(nvd, sizeof (vdev_t)); 215 } 216 kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE); 217 218 if (!error) 219 vd->vdev_is_failing = B_FALSE; 220 221 return (error); 222 } 223 224 static int 225 vdev_file_io_start(zio_t *zio) 226 { 227 vdev_t *vd = zio->io_vd; 228 vdev_file_t *vf = vd->vdev_tsd; 229 ssize_t resid; 230 int error; 231 232 if (zio->io_type == ZIO_TYPE_IOCTL) { 233 zio_vdev_io_bypass(zio); 234 235 /* XXPOLICY */ 236 if (!vdev_readable(vd)) { 237 zio->io_error = ENXIO; 238 return (ZIO_PIPELINE_CONTINUE); 239 } 240 241 switch (zio->io_cmd) { 242 case DKIOCFLUSHWRITECACHE: 243 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 244 kcred, NULL); 245 dprintf("fsync(%s) = %d\n", vdev_description(vd), 246 zio->io_error); 247 break; 248 default: 249 zio->io_error = ENOTSUP; 250 } 251 252 return (ZIO_PIPELINE_CONTINUE); 253 } 254 255 /* 256 * In the kernel, don't bother double-caching, but in userland, 257 * we want to test the vdev_cache code. 258 */ 259 #ifndef _KERNEL 260 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 261 return (ZIO_PIPELINE_STOP); 262 #endif 263 264 if ((zio = vdev_queue_io(zio)) == NULL) 265 return (ZIO_PIPELINE_STOP); 266 267 /* XXPOLICY */ 268 if (zio->io_type == ZIO_TYPE_WRITE) 269 error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 270 else 271 error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; 272 error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; 273 if (error) { 274 zio->io_error = error; 275 zio_interrupt(zio); 276 return (ZIO_PIPELINE_STOP); 277 } 278 279 zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 280 UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, 281 zio->io_size, zio->io_offset, UIO_SYSSPACE, 282 0, RLIM64_INFINITY, kcred, &resid); 283 284 if (resid != 0 && zio->io_error == 0) 285 zio->io_error = ENOSPC; 286 287 zio_interrupt(zio); 288 289 return (ZIO_PIPELINE_STOP); 290 } 291 292 static int 293 vdev_file_io_done(zio_t *zio) 294 { 295 vdev_t *vd = zio->io_vd; 296 297 if (zio_injection_enabled && zio->io_error == 0) 298 zio->io_error = zio_handle_device_injection(vd, EIO); 299 300 /* 301 * If an error has been encountered then attempt to probe the device 302 * to determine if it's still accessible. 303 */ 304 if (zio->io_error == EIO && vdev_probe(vd) != 0) { 305 if (!vd->vdev_is_failing) { 306 vd->vdev_is_failing = B_TRUE; 307 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 308 vd->vdev_spa, vd, zio, 0, 0); 309 } 310 } 311 312 vdev_queue_io_done(zio); 313 314 #ifndef _KERNEL 315 if (zio->io_type == ZIO_TYPE_WRITE) 316 vdev_cache_write(zio); 317 #endif 318 319 return (ZIO_PIPELINE_CONTINUE); 320 } 321 322 vdev_ops_t vdev_file_ops = { 323 vdev_file_open, 324 vdev_file_close, 325 vdev_file_probe, 326 vdev_default_asize, 327 vdev_file_io_start, 328 vdev_file_io_done, 329 NULL, 330 VDEV_TYPE_FILE, /* name of this vdev type */ 331 B_TRUE /* leaf vdev */ 332 }; 333 334 /* 335 * From userland we access disks just like files. 336 */ 337 #ifndef _KERNEL 338 339 vdev_ops_t vdev_disk_ops = { 340 vdev_file_open, 341 vdev_file_close, 342 vdev_file_probe, 343 vdev_default_asize, 344 vdev_file_io_start, 345 vdev_file_io_done, 346 NULL, 347 VDEV_TYPE_DISK, /* name of this vdev type */ 348 B_TRUE /* leaf vdev */ 349 }; 350 351 #endif 352