xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_file.c (revision d2a8fad3579763bd288260c8c465ab9eb448d465)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
24  * Copyright (c) 2025, Klara, Inc.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_file.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/zio.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 #include <sys/abd.h>
35 #include <sys/stat.h>
36 
37 /*
38  * Virtual device vector for files.
39  */
40 
41 static taskq_t *vdev_file_taskq;
42 
43 /*
44  * By default, the logical/physical ashift for file vdevs is set to
45  * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9)
46  * blocksizes. Users may opt to change one or both of these for testing
47  * or performance reasons. Care should be taken as these values will
48  * impact the vdev_ashift setting which can only be set at vdev creation
49  * time.
50  */
51 static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
52 static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
53 
54 void
vdev_file_init(void)55 vdev_file_init(void)
56 {
57 	vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
58 	    minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
59 
60 	VERIFY(vdev_file_taskq);
61 }
62 
63 void
vdev_file_fini(void)64 vdev_file_fini(void)
65 {
66 	taskq_destroy(vdev_file_taskq);
67 }
68 
69 static void
vdev_file_hold(vdev_t * vd)70 vdev_file_hold(vdev_t *vd)
71 {
72 	ASSERT3P(vd->vdev_path, !=, NULL);
73 }
74 
75 static void
vdev_file_rele(vdev_t * vd)76 vdev_file_rele(vdev_t *vd)
77 {
78 	ASSERT3P(vd->vdev_path, !=, NULL);
79 }
80 
81 static mode_t
vdev_file_open_mode(spa_mode_t spa_mode)82 vdev_file_open_mode(spa_mode_t spa_mode)
83 {
84 	mode_t mode = 0;
85 
86 	if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
87 		mode = O_RDWR;
88 	} else if (spa_mode & SPA_MODE_READ) {
89 		mode = O_RDONLY;
90 	} else if (spa_mode & SPA_MODE_WRITE) {
91 		mode = O_WRONLY;
92 	}
93 
94 	return (mode | O_LARGEFILE);
95 }
96 
97 static int
vdev_file_open(vdev_t * vd,uint64_t * psize,uint64_t * max_psize,uint64_t * logical_ashift,uint64_t * physical_ashift)98 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
99     uint64_t *logical_ashift, uint64_t *physical_ashift)
100 {
101 	vdev_file_t *vf;
102 	zfs_file_t *fp;
103 	zfs_file_attr_t zfa;
104 	int error;
105 
106 	/*
107 	 * Rotational optimizations only make sense on block devices.
108 	 */
109 	vd->vdev_nonrot = B_TRUE;
110 
111 	/*
112 	 * Allow TRIM on file based vdevs.  This may not always be supported,
113 	 * since it depends on your kernel version and underlying filesystem
114 	 * type but it is always safe to attempt.
115 	 */
116 	vd->vdev_has_trim = B_TRUE;
117 
118 	/*
119 	 * Disable secure TRIM on file based vdevs.  There is no way to
120 	 * request this behavior from the underlying filesystem.
121 	 */
122 	vd->vdev_has_securetrim = B_FALSE;
123 
124 	/*
125 	 * We must have a pathname, and it must be absolute.
126 	 */
127 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
128 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
129 		return (SET_ERROR(EINVAL));
130 	}
131 
132 	/*
133 	 * Reopen the device if it's not currently open.  Otherwise,
134 	 * just update the physical size of the device.
135 	 */
136 	if (vd->vdev_tsd != NULL) {
137 		ASSERT(vd->vdev_reopening);
138 		vf = vd->vdev_tsd;
139 		goto skip_open;
140 	}
141 
142 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
143 
144 	/*
145 	 * We always open the files from the root of the global zone, even if
146 	 * we're in a local zone.  If the user has gotten to this point, the
147 	 * administrator has already decided that the pool should be available
148 	 * to local zone users, so the underlying devices should be as well.
149 	 */
150 	ASSERT3P(vd->vdev_path, !=, NULL);
151 	ASSERT3S(vd->vdev_path[0], ==, '/');
152 
153 	error = zfs_file_open(vd->vdev_path,
154 	    vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
155 	if (error) {
156 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
157 		return (error);
158 	}
159 
160 	vf->vf_file = fp;
161 
162 #ifdef _KERNEL
163 	/*
164 	 * Make sure it's a regular file.
165 	 */
166 	if (zfs_file_getattr(fp, &zfa)) {
167 		return (SET_ERROR(ENODEV));
168 	}
169 	if (!S_ISREG(zfa.zfa_mode)) {
170 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
171 		return (SET_ERROR(ENODEV));
172 	}
173 #endif
174 
175 skip_open:
176 
177 	error =  zfs_file_getattr(vf->vf_file, &zfa);
178 	if (error) {
179 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
180 		return (error);
181 	}
182 
183 	*max_psize = *psize = zfa.zfa_size;
184 	*logical_ashift = vdev_file_logical_ashift;
185 	*physical_ashift = vdev_file_physical_ashift;
186 
187 	return (0);
188 }
189 
190 static void
vdev_file_close(vdev_t * vd)191 vdev_file_close(vdev_t *vd)
192 {
193 	vdev_file_t *vf = vd->vdev_tsd;
194 
195 	if (vd->vdev_reopening || vf == NULL)
196 		return;
197 
198 	if (vf->vf_file != NULL) {
199 		(void) zfs_file_close(vf->vf_file);
200 	}
201 
202 	vd->vdev_delayed_close = B_FALSE;
203 	kmem_free(vf, sizeof (vdev_file_t));
204 	vd->vdev_tsd = NULL;
205 }
206 
207 static void
vdev_file_io_strategy(void * arg)208 vdev_file_io_strategy(void *arg)
209 {
210 	zio_t *zio = (zio_t *)arg;
211 	vdev_t *vd = zio->io_vd;
212 	vdev_file_t *vf = vd->vdev_tsd;
213 	void *buf;
214 	ssize_t resid;
215 	loff_t off;
216 	ssize_t size;
217 	int err;
218 
219 	off = zio->io_offset;
220 	size = zio->io_size;
221 	resid = 0;
222 
223 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
224 	if (zio->io_type == ZIO_TYPE_READ) {
225 		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
226 		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
227 		abd_return_buf_copy(zio->io_abd, buf, size);
228 	} else {
229 		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
230 		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
231 		abd_return_buf(zio->io_abd, buf, size);
232 	}
233 	zio->io_error = err;
234 	if (resid != 0 && zio->io_error == 0)
235 		zio->io_error = SET_ERROR(ENOSPC);
236 
237 	zio_delay_interrupt(zio);
238 }
239 
240 static void
vdev_file_io_fsync(void * arg)241 vdev_file_io_fsync(void *arg)
242 {
243 	zio_t *zio = (zio_t *)arg;
244 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
245 
246 	zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
247 
248 	zio_interrupt(zio);
249 }
250 
251 static void
vdev_file_io_deallocate(void * arg)252 vdev_file_io_deallocate(void *arg)
253 {
254 	zio_t *zio = (zio_t *)arg;
255 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
256 
257 	zio->io_error = zfs_file_deallocate(vf->vf_file,
258 	    zio->io_offset, zio->io_size);
259 
260 	zio_interrupt(zio);
261 }
262 
263 static void
vdev_file_io_start(zio_t * zio)264 vdev_file_io_start(zio_t *zio)
265 {
266 	vdev_t *vd = zio->io_vd;
267 
268 	if (zio->io_type == ZIO_TYPE_FLUSH) {
269 		/* XXPOLICY */
270 		if (!vdev_readable(vd)) {
271 			zio->io_error = SET_ERROR(ENXIO);
272 			zio_interrupt(zio);
273 			return;
274 		}
275 
276 		if (zfs_nocacheflush) {
277 			zio_interrupt(zio);
278 			return;
279 		}
280 
281 		VERIFY3U(taskq_dispatch(vdev_file_taskq,
282 		    vdev_file_io_fsync, zio, TQ_SLEEP), !=, TASKQID_INVALID);
283 
284 		return;
285 	}
286 
287 	if (zio->io_type == ZIO_TYPE_TRIM) {
288 		ASSERT3U(zio->io_size, !=, 0);
289 
290 		VERIFY3U(taskq_dispatch(vdev_file_taskq,
291 		    vdev_file_io_deallocate, zio, TQ_SLEEP), !=,
292 		    TASKQID_INVALID);
293 
294 		return;
295 	}
296 
297 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
298 	zio->io_target_timestamp = zio_handle_io_delay(zio);
299 
300 	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
301 	    TQ_SLEEP), !=, TASKQID_INVALID);
302 }
303 
304 static void
vdev_file_io_done(zio_t * zio)305 vdev_file_io_done(zio_t *zio)
306 {
307 	(void) zio;
308 }
309 
310 vdev_ops_t vdev_file_ops = {
311 	.vdev_op_init = NULL,
312 	.vdev_op_fini = NULL,
313 	.vdev_op_open = vdev_file_open,
314 	.vdev_op_close = vdev_file_close,
315 	.vdev_op_asize = vdev_default_asize,
316 	.vdev_op_min_asize = vdev_default_min_asize,
317 	.vdev_op_min_alloc = NULL,
318 	.vdev_op_io_start = vdev_file_io_start,
319 	.vdev_op_io_done = vdev_file_io_done,
320 	.vdev_op_state_change = NULL,
321 	.vdev_op_need_resilver = NULL,
322 	.vdev_op_hold = vdev_file_hold,
323 	.vdev_op_rele = vdev_file_rele,
324 	.vdev_op_remap = NULL,
325 	.vdev_op_xlate = vdev_default_xlate,
326 	.vdev_op_rebuild_asize = NULL,
327 	.vdev_op_metaslab_init = NULL,
328 	.vdev_op_config_generate = NULL,
329 	.vdev_op_nparity = NULL,
330 	.vdev_op_ndisks = NULL,
331 	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
332 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
333 };
334 
335 /*
336  * From userland we access disks just like files.
337  */
338 #ifndef _KERNEL
339 
340 vdev_ops_t vdev_disk_ops = {
341 	.vdev_op_init = NULL,
342 	.vdev_op_fini = NULL,
343 	.vdev_op_open = vdev_file_open,
344 	.vdev_op_close = vdev_file_close,
345 	.vdev_op_asize = vdev_default_asize,
346 	.vdev_op_min_asize = vdev_default_min_asize,
347 	.vdev_op_min_alloc = NULL,
348 	.vdev_op_io_start = vdev_file_io_start,
349 	.vdev_op_io_done = vdev_file_io_done,
350 	.vdev_op_state_change = NULL,
351 	.vdev_op_need_resilver = NULL,
352 	.vdev_op_hold = vdev_file_hold,
353 	.vdev_op_rele = vdev_file_rele,
354 	.vdev_op_remap = NULL,
355 	.vdev_op_xlate = vdev_default_xlate,
356 	.vdev_op_rebuild_asize = NULL,
357 	.vdev_op_metaslab_init = NULL,
358 	.vdev_op_config_generate = NULL,
359 	.vdev_op_nparity = NULL,
360 	.vdev_op_ndisks = NULL,
361 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
362 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
363 };
364 
365 #endif
366 
367 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW,
368 	"Logical ashift for file-based devices");
369 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW,
370 	"Physical ashift for file-based devices");
371