1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
24 * Copyright (c) 2025, Klara, Inc.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_file.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/zio.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 #include <sys/abd.h>
35 #include <sys/stat.h>
36
37 /*
38 * Virtual device vector for files.
39 */
40
/*
 * Task queue shared by all file vdevs.  It is created in vdev_file_init(),
 * destroyed in vdev_file_fini(), and vdev_file_io_start() dispatches every
 * read, write, flush and TRIM request onto it.
 */
static taskq_t *vdev_file_taskq;
42
/*
 * By default, the logical/physical ashift for file vdevs is set to
 * SPA_MINBLOCKSHIFT (9).  This allows all file vdevs to use 512B (1 << 9)
 * blocksizes.  Users may opt to change one or both of these for testing
 * or performance reasons.  Care should be taken as these values will
 * impact the vdev_ashift setting which can only be set at vdev creation
 * time.
 *
 * vdev_file_open() reports these values verbatim through its
 * logical_ashift and physical_ashift output parameters.
 */
static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
53
/*
 * One-time setup for the file vdev layer: create the "z_vdev_file" task
 * queue used for all file vdev I/O.  The thread-count argument is
 * MAX(boot_ncpus, 16), so it never drops below 16 regardless of how many
 * CPUs were seen at boot.  Failure to create the queue is fatal (VERIFY).
 */
void
vdev_file_init(void)
{
	vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
	    minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);

	VERIFY(vdev_file_taskq);
}
62
/*
 * Tear down the file vdev layer by destroying the task queue created in
 * vdev_file_init().
 */
void
vdev_file_fini(void)
{
	taskq_destroy(vdev_file_taskq);
}
68
/*
 * Hold callback for file vdevs (installed as .vdev_op_hold).  No reference
 * is actually taken here; the function only asserts that the vdev has a
 * path.
 */
static void
vdev_file_hold(vdev_t *vd)
{
	ASSERT3P(vd->vdev_path, !=, NULL);
}
74
/*
 * Release callback for file vdevs (installed as .vdev_op_rele).  Mirrors
 * vdev_file_hold(): nothing is released, the function only asserts that
 * the vdev has a path.
 */
static void
vdev_file_rele(vdev_t *vd)
{
	ASSERT3P(vd->vdev_path, !=, NULL);
}
80
81 static mode_t
vdev_file_open_mode(spa_mode_t spa_mode)82 vdev_file_open_mode(spa_mode_t spa_mode)
83 {
84 mode_t mode = 0;
85
86 if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
87 mode = O_RDWR;
88 } else if (spa_mode & SPA_MODE_READ) {
89 mode = O_RDONLY;
90 } else if (spa_mode & SPA_MODE_WRITE) {
91 mode = O_WRONLY;
92 }
93
94 return (mode | O_LARGEFILE);
95 }
96
97 static int
vdev_file_open(vdev_t * vd,uint64_t * psize,uint64_t * max_psize,uint64_t * logical_ashift,uint64_t * physical_ashift)98 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
99 uint64_t *logical_ashift, uint64_t *physical_ashift)
100 {
101 vdev_file_t *vf;
102 zfs_file_t *fp;
103 zfs_file_attr_t zfa;
104 int error;
105
106 /*
107 * Rotational optimizations only make sense on block devices.
108 */
109 vd->vdev_nonrot = B_TRUE;
110
111 /*
112 * Allow TRIM on file based vdevs. This may not always be supported,
113 * since it depends on your kernel version and underlying filesystem
114 * type but it is always safe to attempt.
115 */
116 vd->vdev_has_trim = B_TRUE;
117
118 /*
119 * Disable secure TRIM on file based vdevs. There is no way to
120 * request this behavior from the underlying filesystem.
121 */
122 vd->vdev_has_securetrim = B_FALSE;
123
124 /*
125 * We must have a pathname, and it must be absolute.
126 */
127 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
128 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
129 return (SET_ERROR(EINVAL));
130 }
131
132 /*
133 * Reopen the device if it's not currently open. Otherwise,
134 * just update the physical size of the device.
135 */
136 if (vd->vdev_tsd != NULL) {
137 ASSERT(vd->vdev_reopening);
138 vf = vd->vdev_tsd;
139 goto skip_open;
140 }
141
142 vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
143
144 /*
145 * We always open the files from the root of the global zone, even if
146 * we're in a local zone. If the user has gotten to this point, the
147 * administrator has already decided that the pool should be available
148 * to local zone users, so the underlying devices should be as well.
149 */
150 ASSERT3P(vd->vdev_path, !=, NULL);
151 ASSERT3S(vd->vdev_path[0], ==, '/');
152
153 error = zfs_file_open(vd->vdev_path,
154 vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
155 if (error) {
156 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
157 return (error);
158 }
159
160 vf->vf_file = fp;
161
162 #ifdef _KERNEL
163 /*
164 * Make sure it's a regular file.
165 */
166 if (zfs_file_getattr(fp, &zfa)) {
167 return (SET_ERROR(ENODEV));
168 }
169 if (!S_ISREG(zfa.zfa_mode)) {
170 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
171 return (SET_ERROR(ENODEV));
172 }
173 #endif
174
175 skip_open:
176
177 error = zfs_file_getattr(vf->vf_file, &zfa);
178 if (error) {
179 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
180 return (error);
181 }
182
183 *max_psize = *psize = zfa.zfa_size;
184 *logical_ashift = vdev_file_logical_ashift;
185 *physical_ashift = vdev_file_physical_ashift;
186
187 return (0);
188 }
189
190 static void
vdev_file_close(vdev_t * vd)191 vdev_file_close(vdev_t *vd)
192 {
193 vdev_file_t *vf = vd->vdev_tsd;
194
195 if (vd->vdev_reopening || vf == NULL)
196 return;
197
198 if (vf->vf_file != NULL) {
199 (void) zfs_file_close(vf->vf_file);
200 }
201
202 vd->vdev_delayed_close = B_FALSE;
203 kmem_free(vf, sizeof (vdev_file_t));
204 vd->vdev_tsd = NULL;
205 }
206
207 static void
vdev_file_io_strategy(void * arg)208 vdev_file_io_strategy(void *arg)
209 {
210 zio_t *zio = (zio_t *)arg;
211 vdev_t *vd = zio->io_vd;
212 vdev_file_t *vf = vd->vdev_tsd;
213 void *buf;
214 ssize_t resid;
215 loff_t off;
216 ssize_t size;
217 int err;
218
219 off = zio->io_offset;
220 size = zio->io_size;
221 resid = 0;
222
223 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
224 if (zio->io_type == ZIO_TYPE_READ) {
225 buf = abd_borrow_buf(zio->io_abd, zio->io_size);
226 err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
227 abd_return_buf_copy(zio->io_abd, buf, size);
228 } else {
229 buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
230 err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
231 abd_return_buf(zio->io_abd, buf, size);
232 }
233 zio->io_error = err;
234 if (resid != 0 && zio->io_error == 0)
235 zio->io_error = SET_ERROR(ENOSPC);
236
237 zio_delay_interrupt(zio);
238 }
239
240 static void
vdev_file_io_fsync(void * arg)241 vdev_file_io_fsync(void *arg)
242 {
243 zio_t *zio = (zio_t *)arg;
244 vdev_file_t *vf = zio->io_vd->vdev_tsd;
245
246 zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
247
248 zio_interrupt(zio);
249 }
250
251 static void
vdev_file_io_deallocate(void * arg)252 vdev_file_io_deallocate(void *arg)
253 {
254 zio_t *zio = (zio_t *)arg;
255 vdev_file_t *vf = zio->io_vd->vdev_tsd;
256
257 zio->io_error = zfs_file_deallocate(vf->vf_file,
258 zio->io_offset, zio->io_size);
259
260 zio_interrupt(zio);
261 }
262
263 static void
vdev_file_io_start(zio_t * zio)264 vdev_file_io_start(zio_t *zio)
265 {
266 vdev_t *vd = zio->io_vd;
267
268 if (zio->io_type == ZIO_TYPE_FLUSH) {
269 /* XXPOLICY */
270 if (!vdev_readable(vd)) {
271 zio->io_error = SET_ERROR(ENXIO);
272 zio_interrupt(zio);
273 return;
274 }
275
276 if (zfs_nocacheflush) {
277 zio_interrupt(zio);
278 return;
279 }
280
281 VERIFY3U(taskq_dispatch(vdev_file_taskq,
282 vdev_file_io_fsync, zio, TQ_SLEEP), !=, TASKQID_INVALID);
283
284 return;
285 }
286
287 if (zio->io_type == ZIO_TYPE_TRIM) {
288 ASSERT3U(zio->io_size, !=, 0);
289
290 VERIFY3U(taskq_dispatch(vdev_file_taskq,
291 vdev_file_io_deallocate, zio, TQ_SLEEP), !=,
292 TASKQID_INVALID);
293
294 return;
295 }
296
297 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
298 zio->io_target_timestamp = zio_handle_io_delay(zio);
299
300 VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
301 TQ_SLEEP), !=, TASKQID_INVALID);
302 }
303
/*
 * I/O completion callback for file vdevs (installed as .vdev_op_io_done).
 * There is no per-vdev completion work to do, so this is a no-op.
 */
static void
vdev_file_io_done(zio_t *zio)
{
	/* Reference the parameter so it is not reported as unused. */
	(void) zio;
}
309
310 vdev_ops_t vdev_file_ops = {
311 .vdev_op_init = NULL,
312 .vdev_op_fini = NULL,
313 .vdev_op_open = vdev_file_open,
314 .vdev_op_close = vdev_file_close,
315 .vdev_op_asize = vdev_default_asize,
316 .vdev_op_min_asize = vdev_default_min_asize,
317 .vdev_op_min_alloc = NULL,
318 .vdev_op_io_start = vdev_file_io_start,
319 .vdev_op_io_done = vdev_file_io_done,
320 .vdev_op_state_change = NULL,
321 .vdev_op_need_resilver = NULL,
322 .vdev_op_hold = vdev_file_hold,
323 .vdev_op_rele = vdev_file_rele,
324 .vdev_op_remap = NULL,
325 .vdev_op_xlate = vdev_default_xlate,
326 .vdev_op_rebuild_asize = NULL,
327 .vdev_op_metaslab_init = NULL,
328 .vdev_op_config_generate = NULL,
329 .vdev_op_nparity = NULL,
330 .vdev_op_ndisks = NULL,
331 .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
332 .vdev_op_leaf = B_TRUE /* leaf vdev */
333 };
334
335 /*
336 * From userland we access disks just like files.
337 */
338 #ifndef _KERNEL
339
340 vdev_ops_t vdev_disk_ops = {
341 .vdev_op_init = NULL,
342 .vdev_op_fini = NULL,
343 .vdev_op_open = vdev_file_open,
344 .vdev_op_close = vdev_file_close,
345 .vdev_op_asize = vdev_default_asize,
346 .vdev_op_min_asize = vdev_default_min_asize,
347 .vdev_op_min_alloc = NULL,
348 .vdev_op_io_start = vdev_file_io_start,
349 .vdev_op_io_done = vdev_file_io_done,
350 .vdev_op_state_change = NULL,
351 .vdev_op_need_resilver = NULL,
352 .vdev_op_hold = vdev_file_hold,
353 .vdev_op_rele = vdev_file_rele,
354 .vdev_op_remap = NULL,
355 .vdev_op_xlate = vdev_default_xlate,
356 .vdev_op_rebuild_asize = NULL,
357 .vdev_op_metaslab_init = NULL,
358 .vdev_op_config_generate = NULL,
359 .vdev_op_nparity = NULL,
360 .vdev_op_ndisks = NULL,
361 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
362 .vdev_op_leaf = B_TRUE /* leaf vdev */
363 };
364
365 #endif
366
/*
 * Register the two ashift tunables defined at the top of this file
 * (vdev_file_logical_ashift and vdev_file_physical_ashift) as UINT module
 * parameters.
 * NOTE(review): ZMOD_RW presumably makes them writable at runtime --
 * confirm against the ZFS_MODULE_PARAM definition.
 */
ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW,
	"Logical ashift for file-based devices");
ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW,
	"Physical ashift for file-based devices");
371