xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_file.c (revision 9d5d194537eaddd9cf553f2a5b18fd51a4e74afc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_file.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for files.
37  */
38 
39 static int
40 vdev_file_open_common(vdev_t *vd)
41 {
42 	vdev_file_t *vf;
43 	vnode_t *vp;
44 	int error;
45 
46 	/*
47 	 * We must have a pathname, and it must be absolute.
48 	 */
49 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
50 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
51 		return (EINVAL);
52 	}
53 
54 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
55 
56 	/*
57 	 * We always open the files from the root of the global zone, even if
58 	 * we're in a local zone.  If the user has gotten to this point, the
59 	 * administrator has already decided that the pool should be available
60 	 * to local zone users, so the underlying devices should be as well.
61 	 */
62 	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
63 	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
64 	    spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
65 
66 	if (error) {
67 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
68 		return (error);
69 	}
70 
71 	vf->vf_vnode = vp;
72 
73 #ifdef _KERNEL
74 	/*
75 	 * Make sure it's a regular file.
76 	 */
77 	if (vp->v_type != VREG) {
78 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
79 		return (ENODEV);
80 	}
81 #endif
82 
83 	return (0);
84 }
85 
86 static int
87 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
88 {
89 	vdev_file_t *vf;
90 	vattr_t vattr;
91 	int error;
92 
93 	if ((error = vdev_file_open_common(vd)) != 0)
94 		return (error);
95 
96 	vf = vd->vdev_tsd;
97 
98 	/*
99 	 * Determine the physical size of the file.
100 	 */
101 	vattr.va_mask = AT_SIZE;
102 	error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
103 	if (error) {
104 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
105 		return (error);
106 	}
107 
108 	*psize = vattr.va_size;
109 	*ashift = SPA_MINBLOCKSHIFT;
110 
111 	return (0);
112 }
113 
114 static void
115 vdev_file_close(vdev_t *vd)
116 {
117 	vdev_file_t *vf = vd->vdev_tsd;
118 
119 	if (vf == NULL)
120 		return;
121 
122 	if (vf->vf_vnode != NULL) {
123 		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
124 		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
125 		VN_RELE(vf->vf_vnode);
126 	}
127 
128 	kmem_free(vf, sizeof (vdev_file_t));
129 	vd->vdev_tsd = NULL;
130 }
131 
132 static int
133 vdev_file_probe_io(vdev_t *vd, caddr_t data, size_t size, uint64_t offset,
134     enum uio_rw rw)
135 {
136 	vdev_file_t *vf = vd ? vd->vdev_tsd : NULL;
137 	ssize_t resid;
138 	int error = 0;
139 
140 	if (vd == NULL || vf == NULL || vf->vf_vnode == NULL)
141 		return (EINVAL);
142 
143 	ASSERT(rw == UIO_READ || rw ==  UIO_WRITE);
144 
145 	error = vn_rdwr(rw, vf->vf_vnode, data, size, offset, UIO_SYSSPACE,
146 	    0, RLIM64_INFINITY, kcred, &resid);
147 	if (error || resid != 0)
148 		return (EIO);
149 	return (0);
150 }
151 
152 /*
153  * Determine if the underlying device is accessible by reading and writing
154  * to a known location. We must be able to do this during syncing context
155  * and thus we cannot set the vdev state directly.
156  */
157 static int
158 vdev_file_probe(vdev_t *vd)
159 {
160 	vdev_t *nvd;
161 	char *vl_boot;
162 	uint64_t offset;
163 	int l, error = 0, retries = 0;
164 
165 	if (vd == NULL)
166 		return (EINVAL);
167 
168 	/* Hijack the current vdev */
169 	nvd = vd;
170 
171 	/*
172 	 * Pick a random label to rewrite.
173 	 */
174 	l = spa_get_random(VDEV_LABELS);
175 	ASSERT(l < VDEV_LABELS);
176 
177 	offset = vdev_label_offset(vd->vdev_psize, l,
178 	    offsetof(vdev_label_t, vl_boot_header));
179 
180 	vl_boot = kmem_alloc(VDEV_BOOT_HEADER_SIZE, KM_SLEEP);
181 
182 	while ((error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
183 	    offset, UIO_READ)) != 0 && retries == 0) {
184 
185 		/*
186 		 * If we failed with the vdev that was passed in then
187 		 * try allocating a new one and try again.
188 		 */
189 		nvd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
190 		if (vd->vdev_path)
191 			nvd->vdev_path = spa_strdup(vd->vdev_path);
192 		retries++;
193 
194 		error = vdev_file_open_common(nvd);
195 		if (error)
196 			break;
197 	}
198 
199 	if ((spa_mode & FWRITE) && !error) {
200 		error = vdev_file_probe_io(nvd, vl_boot, VDEV_BOOT_HEADER_SIZE,
201 		    offset, UIO_WRITE);
202 	}
203 
204 	if (retries) {
205 		vdev_file_close(nvd);
206 		if (nvd->vdev_path)
207 			spa_strfree(nvd->vdev_path);
208 		kmem_free(nvd, sizeof (vdev_t));
209 	}
210 	kmem_free(vl_boot, VDEV_BOOT_HEADER_SIZE);
211 
212 	if (!error)
213 		vd->vdev_is_failing = B_FALSE;
214 
215 	return (error);
216 }
217 
218 static int
219 vdev_file_io_start(zio_t *zio)
220 {
221 	vdev_t *vd = zio->io_vd;
222 	vdev_file_t *vf = vd->vdev_tsd;
223 	ssize_t resid;
224 	int error;
225 
226 	if (zio->io_type == ZIO_TYPE_IOCTL) {
227 		zio_vdev_io_bypass(zio);
228 
229 		/* XXPOLICY */
230 		if (!vdev_readable(vd)) {
231 			zio->io_error = ENXIO;
232 			return (ZIO_PIPELINE_CONTINUE);
233 		}
234 
235 		switch (zio->io_cmd) {
236 		case DKIOCFLUSHWRITECACHE:
237 			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
238 			    kcred, NULL);
239 			dprintf("fsync(%s) = %d\n", vdev_description(vd),
240 			    zio->io_error);
241 			break;
242 		default:
243 			zio->io_error = ENOTSUP;
244 		}
245 
246 		return (ZIO_PIPELINE_CONTINUE);
247 	}
248 
249 	/*
250 	 * In the kernel, don't bother double-caching, but in userland,
251 	 * we want to test the vdev_cache code.
252 	 */
253 #ifndef _KERNEL
254 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
255 		return (ZIO_PIPELINE_STOP);
256 #endif
257 
258 	if ((zio = vdev_queue_io(zio)) == NULL)
259 		return (ZIO_PIPELINE_STOP);
260 
261 	/* XXPOLICY */
262 	if (zio->io_type == ZIO_TYPE_WRITE)
263 		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
264 	else
265 		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
266 	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
267 	if (error) {
268 		zio->io_error = error;
269 		zio_interrupt(zio);
270 		return (ZIO_PIPELINE_STOP);
271 	}
272 
273 	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
274 	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
275 	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
276 	    0, RLIM64_INFINITY, kcred, &resid);
277 
278 	if (resid != 0 && zio->io_error == 0)
279 		zio->io_error = ENOSPC;
280 
281 	zio_interrupt(zio);
282 
283 	return (ZIO_PIPELINE_STOP);
284 }
285 
286 static int
287 vdev_file_io_done(zio_t *zio)
288 {
289 	vdev_t *vd = zio->io_vd;
290 
291 	if (zio_injection_enabled && zio->io_error == 0)
292 		zio->io_error = zio_handle_device_injection(vd, EIO);
293 
294 	/*
295 	 * If an error has been encountered then attempt to probe the device
296 	 * to determine if it's still accessible.
297 	 */
298 	if (zio->io_error == EIO && vdev_probe(vd) != 0)
299 		vd->vdev_is_failing = B_TRUE;
300 
301 	vdev_queue_io_done(zio);
302 
303 #ifndef _KERNEL
304 	if (zio->io_type == ZIO_TYPE_WRITE)
305 		vdev_cache_write(zio);
306 #endif
307 
308 	return (ZIO_PIPELINE_CONTINUE);
309 }
310 
/*
 * Operations vector for file-backed vdevs.  The initializer is positional,
 * so the entries must stay in the order in which vdev_ops_t declares its
 * members.
 */
vdev_ops_t vdev_file_ops = {
	vdev_file_open,
	vdev_file_close,
	vdev_file_probe,
	vdev_default_asize,
	vdev_file_io_start,
	vdev_file_io_done,
	NULL,			/* unset slot -- TODO confirm which callback */
	VDEV_TYPE_FILE,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
322 
323 /*
324  * From userland we access disks just like files.
325  */
326 #ifndef _KERNEL
327 
/*
 * Operations vector for disk vdevs when built without _KERNEL.  It reuses
 * the vdev_file_* entry points unchanged; only the type name differs from
 * vdev_file_ops.  The initializer is positional, so the entries must stay
 * in the order in which vdev_ops_t declares its members.
 */
vdev_ops_t vdev_disk_ops = {
	vdev_file_open,
	vdev_file_close,
	vdev_file_probe,
	vdev_default_asize,
	vdev_file_io_start,
	vdev_file_io_done,
	NULL,			/* unset slot -- TODO confirm which callback */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
339 
340 #endif
341