1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/spa_impl.h> 30 #include <sys/vdev_file.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/vdev_trim.h> 33 #include <sys/zio.h> 34 #include <sys/fs/zfs.h> 35 #include <sys/fm/fs/zfs.h> 36 #include <sys/abd.h> 37 #include <sys/fcntl.h> 38 #include <sys/vnode.h> 39 40 /* 41 * Virtual device vector for files. 42 */ 43 44 static void 45 vdev_file_hold(vdev_t *vd) 46 { 47 ASSERT(vd->vdev_path != NULL); 48 } 49 50 static void 51 vdev_file_rele(vdev_t *vd) 52 { 53 ASSERT(vd->vdev_path != NULL); 54 } 55 56 static int 57 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 58 uint64_t *ashift) 59 { 60 vdev_file_t *vf; 61 vnode_t *vp; 62 vattr_t vattr; 63 int error; 64 65 /* 66 * Rotational optimizations only make sense on block devices. 67 */ 68 vd->vdev_nonrot = B_TRUE; 69 70 /* 71 * Allow TRIM on file based vdevs. This may not always be supported, 72 * since it depends on your kernel version and underlying filesystem 73 * type but it is always safe to attempt. 74 */ 75 vd->vdev_has_trim = B_TRUE; 76 77 /* 78 * Disable secure TRIM on file based vdevs. There is no way to 79 * request this behavior from the underlying filesystem. 80 */ 81 vd->vdev_has_securetrim = B_FALSE; 82 83 /* 84 * We must have a pathname, and it must be absolute. 85 */ 86 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 87 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 88 return (SET_ERROR(EINVAL)); 89 } 90 91 /* 92 * Reopen the device if it's not currently open. Otherwise, 93 * just update the physical size of the device. 94 */ 95 if (vd->vdev_tsd != NULL) { 96 ASSERT(vd->vdev_reopening); 97 vf = vd->vdev_tsd; 98 goto skip_open; 99 } 100 101 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); 102 103 /* 104 * We always open the files from the root of the global zone, even if 105 * we're in a local zone. If the user has gotten to this point, the 106 * administrator has already decided that the pool should be available 107 * to local zone users, so the underlying devices should be as well. 108 */ 109 ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); 110 error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, 111 spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); 112 113 if (error) { 114 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 115 return (error); 116 } 117 118 vf->vf_vnode = vp; 119 120 #ifdef _KERNEL 121 /* 122 * Make sure it's a regular file. 123 */ 124 if (vp->v_type != VREG) { 125 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 126 return (SET_ERROR(ENODEV)); 127 } 128 #endif 129 130 skip_open: 131 /* 132 * Determine the physical size of the file. 133 */ 134 vattr.va_mask = AT_SIZE; 135 error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); 136 if (error) { 137 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 138 return (error); 139 } 140 141 *max_psize = *psize = vattr.va_size; 142 *ashift = SPA_MINBLOCKSHIFT; 143 144 return (0); 145 } 146 147 static void 148 vdev_file_close(vdev_t *vd) 149 { 150 vdev_file_t *vf = vd->vdev_tsd; 151 152 if (vd->vdev_reopening || vf == NULL) 153 return; 154 155 if (vf->vf_vnode != NULL) { 156 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); 157 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, 158 kcred, NULL); 159 VN_RELE(vf->vf_vnode); 160 } 161 162 vd->vdev_delayed_close = B_FALSE; 163 kmem_free(vf, sizeof (vdev_file_t)); 164 vd->vdev_tsd = NULL; 165 } 166 167 /* 168 * Implements the interrupt side for file vdev types. This routine will be 169 * called when the I/O completes allowing us to transfer the I/O to the 170 * interrupt taskqs. For consistency, the code structure mimics disk vdev 171 * types. 172 */ 173 static int 174 vdev_file_io_intr(buf_t *bp) 175 { 176 vdev_buf_t *vb = (vdev_buf_t *)bp; 177 zio_t *zio = vb->vb_io; 178 179 zio->io_error = (geterror(bp) != 0 ? EIO : 0); 180 if (zio->io_error == 0 && bp->b_resid != 0) 181 zio->io_error = SET_ERROR(ENOSPC); 182 183 if (zio->io_type == ZIO_TYPE_READ) { 184 abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); 185 } else { 186 abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); 187 } 188 189 kmem_free(vb, sizeof (vdev_buf_t)); 190 zio_delay_interrupt(zio); 191 return (0); 192 } 193 194 static void 195 vdev_file_io_strategy(void *arg) 196 { 197 buf_t *bp = arg; 198 vnode_t *vp = bp->b_private; 199 ssize_t resid; 200 int error; 201 202 error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE, 203 vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno), 204 UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 205 206 if (error == 0) { 207 bp->b_resid = resid; 208 biodone(bp); 209 } else { 210 bioerror(bp, error); 211 biodone(bp); 212 } 213 } 214 215 static void 216 vdev_file_io_start(zio_t *zio) 217 { 218 vdev_t *vd = zio->io_vd; 219 vdev_file_t *vf = vd->vdev_tsd; 220 vdev_buf_t *vb; 221 buf_t *bp; 222 223 if (zio->io_type == ZIO_TYPE_IOCTL) { 224 /* XXPOLICY */ 225 if (!vdev_readable(vd)) { 226 zio->io_error = SET_ERROR(ENXIO); 227 zio_interrupt(zio); 228 return; 229 } 230 231 switch (zio->io_cmd) { 232 case DKIOCFLUSHWRITECACHE: 233 zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, 234 kcred, NULL); 235 break; 236 default: 237 zio->io_error = SET_ERROR(ENOTSUP); 238 } 239 240 zio_execute(zio); 241 return; 242 } else if (zio->io_type == ZIO_TYPE_TRIM) { 243 struct flock64 flck; 244 245 ASSERT3U(zio->io_size, !=, 0); 246 bzero(&flck, sizeof (flck)); 247 flck.l_type = F_FREESP; 248 flck.l_start = zio->io_offset; 249 flck.l_len = zio->io_size; 250 flck.l_whence = 0; 251 252 zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck, 253 0, 0, kcred, NULL); 254 255 zio_execute(zio); 256 return; 257 } 258 259 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 260 zio->io_target_timestamp = zio_handle_io_delay(zio); 261 262 vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); 263 264 vb->vb_io = zio; 265 bp = &vb->vb_buf; 266 267 bioinit(bp); 268 bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 269 bp->b_bcount = zio->io_size; 270 271 if (zio->io_type == ZIO_TYPE_READ) { 272 bp->b_un.b_addr = 273 abd_borrow_buf(zio->io_abd, zio->io_size); 274 } else { 275 bp->b_un.b_addr = 276 abd_borrow_buf_copy(zio->io_abd, zio->io_size); 277 } 278 279 bp->b_lblkno = lbtodb(zio->io_offset); 280 bp->b_bufsize = zio->io_size; 281 bp->b_private = vf->vf_vnode; 282 bp->b_iodone = vdev_file_io_intr; 283 284 VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp, 285 TQ_SLEEP), !=, TASKQID_INVALID); 286 } 287 288 /* ARGSUSED */ 289 static void 290 vdev_file_io_done(zio_t *zio) 291 { 292 } 293 294 vdev_ops_t vdev_file_ops = { 295 .vdev_op_open = vdev_file_open, 296 .vdev_op_close = vdev_file_close, 297 .vdev_op_asize = vdev_default_asize, 298 .vdev_op_io_start = vdev_file_io_start, 299 .vdev_op_io_done = vdev_file_io_done, 300 .vdev_op_state_change = NULL, 301 .vdev_op_need_resilver = NULL, 302 .vdev_op_hold = vdev_file_hold, 303 .vdev_op_rele = vdev_file_rele, 304 .vdev_op_remap = NULL, 305 .vdev_op_xlate = vdev_default_xlate, 306 .vdev_op_dumpio = NULL, 307 .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ 308 .vdev_op_leaf = B_TRUE /* leaf vdev */ 309 }; 310 311 /* 312 * From userland we access disks just like files. 313 */ 314 #ifndef _KERNEL 315 316 vdev_ops_t vdev_disk_ops = { 317 .vdev_op_open = vdev_file_open, 318 .vdev_op_close = vdev_file_close, 319 .vdev_op_asize = vdev_default_asize, 320 .vdev_op_io_start = vdev_file_io_start, 321 .vdev_op_io_done = vdev_file_io_done, 322 .vdev_op_state_change = NULL, 323 .vdev_op_need_resilver = NULL, 324 .vdev_op_hold = vdev_file_hold, 325 .vdev_op_rele = vdev_file_rele, 326 .vdev_op_remap = NULL, 327 .vdev_op_xlate = vdev_default_xlate, 328 .vdev_op_dumpio = NULL, 329 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 330 .vdev_op_leaf = B_TRUE /* leaf vdev */ 331 }; 332 333 #endif 334