xref: /illumos-gate/usr/src/lib/libzfs_core/common/libzfs_core.c (revision f7e4f33f27df122daef98e5b4b537dc159597da5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
24  * Copyright (c) 2013 Steven Hartland. All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  * Copyright 2017 RackTop Systems.
27  * Copyright (c) 2017 Datto Inc.
28  */
29 
30 /*
31  * LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
32  * It has the following characteristics:
33  *
34  *  - Thread Safe.  libzfs_core is accessible concurrently from multiple
35  *  threads.  This is accomplished primarily by avoiding global data
36  *  (e.g. caching).  Since it's thread-safe, there is no reason for a
37  *  process to have multiple libzfs "instances".  Therefore, we store
38  *  our few pieces of data (e.g. the file descriptor) in global
39  *  variables.  The fd is reference-counted so that the libzfs_core
40  *  library can be "initialized" multiple times (e.g. by different
41  *  consumers within the same process).
42  *
43  *  - Committed Interface.  The libzfs_core interface will be committed,
44  *  therefore consumers can compile against it and be confident that
45  *  their code will continue to work on future releases of this code.
46  *  Currently, the interface is Evolving (not Committed), but we intend
47  *  to commit to it once it is more complete and we determine that it
48  *  meets the needs of all consumers.
49  *
50  *  - Programmatic Error Handling.  libzfs_core communicates errors with
51  *  defined error numbers, and doesn't print anything to stdout/stderr.
52  *
53  *  - Thin Layer.  libzfs_core is a thin layer, marshaling arguments
54  *  to/from the kernel ioctls.  There is generally a 1:1 correspondence
55  *  between libzfs_core functions and ioctls to /dev/zfs.
56  *
57  *  - Clear Atomicity.  Because libzfs_core functions are generally 1:1
58  *  with kernel ioctls, and kernel ioctls are generally atomic, each
59  *  libzfs_core function is atomic.  For example, creating multiple
60  *  snapshots with a single call to lzc_snapshot() is atomic -- it
61  *  can't fail with only some of the requested snapshots created, even
62  *  in the event of power loss or system crash.
63  *
64  *  - Continued libzfs Support.  Some higher-level operations (e.g.
65  *  support for "zfs send -R") are too complicated to fit the scope of
66  *  libzfs_core.  This functionality will continue to live in libzfs.
67  *  Where appropriate, libzfs will use the underlying atomic operations
68  *  of libzfs_core.  For example, libzfs may implement "zfs send -R |
69  *  zfs receive" by using individual "send one snapshot", rename,
70  *  destroy, and "receive one snapshot" operations in libzfs_core.
71  *  /sbin/zfs and /sbin/zpool will link with both libzfs and
72  *  libzfs_core.  Other consumers should aim to use only libzfs_core,
73  *  since that will be the supported, stable interface going forwards.
74  */
75 
76 #include <libzfs_core.h>
77 #include <ctype.h>
78 #include <unistd.h>
79 #include <stdlib.h>
80 #include <string.h>
81 #include <errno.h>
82 #include <fcntl.h>
83 #include <pthread.h>
84 #include <sys/nvpair.h>
85 #include <sys/param.h>
86 #include <sys/types.h>
87 #include <sys/stat.h>
88 #include <sys/zfs_ioctl.h>
89 
/* Descriptor for /dev/zfs shared by all callers; -1 while it is not open. */
static int g_fd = -1;
/* Held by libzfs_core_init()/libzfs_core_fini() while updating the state. */
static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
/* Successful libzfs_core_init() calls not yet undone by libzfs_core_fini(). */
static int g_refcount;
93 
94 int
95 libzfs_core_init(void)
96 {
97 	(void) pthread_mutex_lock(&g_lock);
98 	if (g_refcount == 0) {
99 		g_fd = open("/dev/zfs", O_RDWR);
100 		if (g_fd < 0) {
101 			(void) pthread_mutex_unlock(&g_lock);
102 			return (errno);
103 		}
104 	}
105 	g_refcount++;
106 	(void) pthread_mutex_unlock(&g_lock);
107 	return (0);
108 }
109 
110 void
111 libzfs_core_fini(void)
112 {
113 	(void) pthread_mutex_lock(&g_lock);
114 	ASSERT3S(g_refcount, >, 0);
115 
116 	if (g_refcount > 0)
117 		g_refcount--;
118 
119 	if (g_refcount == 0 && g_fd != -1) {
120 		(void) close(g_fd);
121 		g_fd = -1;
122 	}
123 	(void) pthread_mutex_unlock(&g_lock);
124 }
125 
126 static int
127 lzc_ioctl(zfs_ioc_t ioc, const char *name,
128     nvlist_t *source, nvlist_t **resultp)
129 {
130 	zfs_cmd_t zc = { 0 };
131 	int error = 0;
132 	char *packed = NULL;
133 	size_t size = 0;
134 
135 	ASSERT3S(g_refcount, >, 0);
136 	VERIFY3S(g_fd, !=, -1);
137 
138 	if (name != NULL)
139 		(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
140 
141 	if (source != NULL) {
142 		packed = fnvlist_pack(source, &size);
143 		zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
144 		zc.zc_nvlist_src_size = size;
145 	}
146 
147 	if (resultp != NULL) {
148 		*resultp = NULL;
149 		if (ioc == ZFS_IOC_CHANNEL_PROGRAM) {
150 			zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source,
151 			    ZCP_ARG_MEMLIMIT);
152 		} else {
153 			zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
154 		}
155 		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
156 		    malloc(zc.zc_nvlist_dst_size);
157 		if (zc.zc_nvlist_dst == NULL) {
158 			error = ENOMEM;
159 			goto out;
160 		}
161 	}
162 
163 	while (ioctl(g_fd, ioc, &zc) != 0) {
164 		/*
165 		 * If ioctl exited with ENOMEM, we retry the ioctl after
166 		 * increasing the size of the destination nvlist.
167 		 *
168 		 * Channel programs that exit with ENOMEM ran over the
169 		 * lua memory sandbox; they should not be retried.
170 		 */
171 		if (errno == ENOMEM && resultp != NULL &&
172 		    ioc != ZFS_IOC_CHANNEL_PROGRAM) {
173 			free((void *)(uintptr_t)zc.zc_nvlist_dst);
174 			zc.zc_nvlist_dst_size *= 2;
175 			zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
176 			    malloc(zc.zc_nvlist_dst_size);
177 			if (zc.zc_nvlist_dst == NULL) {
178 				error = ENOMEM;
179 				goto out;
180 			}
181 		} else {
182 			error = errno;
183 			break;
184 		}
185 	}
186 	if (zc.zc_nvlist_dst_filled) {
187 		*resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
188 		    zc.zc_nvlist_dst_size);
189 	}
190 
191 out:
192 	fnvlist_pack_free(packed, size);
193 	free((void *)(uintptr_t)zc.zc_nvlist_dst);
194 	return (error);
195 }
196 
197 int
198 lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
199 {
200 	int error;
201 	nvlist_t *args = fnvlist_alloc();
202 	fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
203 	if (props != NULL)
204 		fnvlist_add_nvlist(args, "props", props);
205 	error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
206 	nvlist_free(args);
207 	return (error);
208 }
209 
210 int
211 lzc_clone(const char *fsname, const char *origin,
212     nvlist_t *props)
213 {
214 	int error;
215 	nvlist_t *args = fnvlist_alloc();
216 	fnvlist_add_string(args, "origin", origin);
217 	if (props != NULL)
218 		fnvlist_add_nvlist(args, "props", props);
219 	error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
220 	nvlist_free(args);
221 	return (error);
222 }
223 
224 int
225 lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen)
226 {
227 	/*
228 	 * The promote ioctl is still legacy, so we need to construct our
229 	 * own zfs_cmd_t rather than using lzc_ioctl().
230 	 */
231 	zfs_cmd_t zc = { 0 };
232 
233 	ASSERT3S(g_refcount, >, 0);
234 	VERIFY3S(g_fd, !=, -1);
235 
236 	(void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name));
237 	if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) {
238 		int error = errno;
239 		if (error == EEXIST && snapnamebuf != NULL)
240 			(void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen);
241 		return (error);
242 	}
243 	return (0);
244 }
245 
246 int
247 lzc_remap(const char *fsname)
248 {
249 	int error;
250 	nvlist_t *args = fnvlist_alloc();
251 	error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL);
252 	nvlist_free(args);
253 	return (error);
254 }
255 
256 int
257 lzc_rename(const char *source, const char *target)
258 {
259 	zfs_cmd_t zc = { 0 };
260 	int error;
261 
262 	ASSERT3S(g_refcount, >, 0);
263 	VERIFY3S(g_fd, !=, -1);
264 
265 	(void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name));
266 	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
267 	error = ioctl(g_fd, ZFS_IOC_RENAME, &zc);
268 	if (error != 0)
269 		error = errno;
270 	return (error);
271 }
272 
273 int
274 lzc_destroy(const char *fsname)
275 {
276 	int error;
277 
278 	nvlist_t *args = fnvlist_alloc();
279 	error = lzc_ioctl(ZFS_IOC_DESTROY, fsname, args, NULL);
280 	nvlist_free(args);
281 	return (error);
282 }
283 
284 /*
285  * Creates snapshots.
286  *
287  * The keys in the snaps nvlist are the snapshots to be created.
288  * They must all be in the same pool.
289  *
290  * The props nvlist is properties to set.  Currently only user properties
291  * are supported.  { user:prop_name -> string value }
292  *
293  * The returned results nvlist will have an entry for each snapshot that failed.
294  * The value will be the (int32) error code.
295  *
296  * The return value will be 0 if all snapshots were created, otherwise it will
297  * be the errno of a (unspecified) snapshot that failed.
298  */
299 int
300 lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
301 {
302 	nvpair_t *elem;
303 	nvlist_t *args;
304 	int error;
305 	char pool[ZFS_MAX_DATASET_NAME_LEN];
306 
307 	*errlist = NULL;
308 
309 	/* determine the pool name */
310 	elem = nvlist_next_nvpair(snaps, NULL);
311 	if (elem == NULL)
312 		return (0);
313 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
314 	pool[strcspn(pool, "/@")] = '\0';
315 
316 	args = fnvlist_alloc();
317 	fnvlist_add_nvlist(args, "snaps", snaps);
318 	if (props != NULL)
319 		fnvlist_add_nvlist(args, "props", props);
320 
321 	error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist);
322 	nvlist_free(args);
323 
324 	return (error);
325 }
326 
327 /*
328  * Destroys snapshots.
329  *
330  * The keys in the snaps nvlist are the snapshots to be destroyed.
331  * They must all be in the same pool.
332  *
333  * Snapshots that do not exist will be silently ignored.
334  *
335  * If 'defer' is not set, and a snapshot has user holds or clones, the
336  * destroy operation will fail and none of the snapshots will be
337  * destroyed.
338  *
339  * If 'defer' is set, and a snapshot has user holds or clones, it will be
340  * marked for deferred destruction, and will be destroyed when the last hold
341  * or clone is removed/destroyed.
342  *
343  * The return value will be 0 if all snapshots were destroyed (or marked for
344  * later destruction if 'defer' is set) or didn't exist to begin with.
345  *
346  * Otherwise the return value will be the errno of a (unspecified) snapshot
347  * that failed, no snapshots will be destroyed, and the errlist will have an
348  * entry for each snapshot that failed.  The value in the errlist will be
349  * the (int32) error code.
350  */
351 int
352 lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
353 {
354 	nvpair_t *elem;
355 	nvlist_t *args;
356 	int error;
357 	char pool[ZFS_MAX_DATASET_NAME_LEN];
358 
359 	/* determine the pool name */
360 	elem = nvlist_next_nvpair(snaps, NULL);
361 	if (elem == NULL)
362 		return (0);
363 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
364 	pool[strcspn(pool, "/@")] = '\0';
365 
366 	args = fnvlist_alloc();
367 	fnvlist_add_nvlist(args, "snaps", snaps);
368 	if (defer)
369 		fnvlist_add_boolean(args, "defer");
370 
371 	error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist);
372 	nvlist_free(args);
373 
374 	return (error);
375 }
376 
377 int
378 lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
379     uint64_t *usedp)
380 {
381 	nvlist_t *args;
382 	nvlist_t *result;
383 	int err;
384 	char fs[ZFS_MAX_DATASET_NAME_LEN];
385 	char *atp;
386 
387 	/* determine the fs name */
388 	(void) strlcpy(fs, firstsnap, sizeof (fs));
389 	atp = strchr(fs, '@');
390 	if (atp == NULL)
391 		return (EINVAL);
392 	*atp = '\0';
393 
394 	args = fnvlist_alloc();
395 	fnvlist_add_string(args, "firstsnap", firstsnap);
396 
397 	err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
398 	nvlist_free(args);
399 	if (err == 0)
400 		*usedp = fnvlist_lookup_uint64(result, "used");
401 	fnvlist_free(result);
402 
403 	return (err);
404 }
405 
406 boolean_t
407 lzc_exists(const char *dataset)
408 {
409 	/*
410 	 * The objset_stats ioctl is still legacy, so we need to construct our
411 	 * own zfs_cmd_t rather than using lzc_ioctl().
412 	 */
413 	zfs_cmd_t zc = { 0 };
414 
415 	ASSERT3S(g_refcount, >, 0);
416 	VERIFY3S(g_fd, !=, -1);
417 
418 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
419 	return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0);
420 }
421 
422 /*
423  * outnvl is unused.
424  * It was added to preserve the function signature in case it is
425  * needed in the future.
426  */
427 /*ARGSUSED*/
428 int
429 lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl)
430 {
431 	return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL));
432 }
433 
434 /*
435  * Create "user holds" on snapshots.  If there is a hold on a snapshot,
436  * the snapshot can not be destroyed.  (However, it can be marked for deletion
437  * by lzc_destroy_snaps(defer=B_TRUE).)
438  *
439  * The keys in the nvlist are snapshot names.
440  * The snapshots must all be in the same pool.
441  * The value is the name of the hold (string type).
442  *
443  * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
444  * In this case, when the cleanup_fd is closed (including on process
445  * termination), the holds will be released.  If the system is shut down
446  * uncleanly, the holds will be released when the pool is next opened
447  * or imported.
448  *
449  * Holds for snapshots which don't exist will be skipped and have an entry
450  * added to errlist, but will not cause an overall failure.
451  *
452  * The return value will be 0 if all holds, for snapshots that existed,
453  * were succesfully created.
454  *
455  * Otherwise the return value will be the errno of a (unspecified) hold that
456  * failed and no holds will be created.
457  *
458  * In all cases the errlist will have an entry for each hold that failed
459  * (name = snapshot), with its value being the error code (int32).
460  */
461 int
462 lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
463 {
464 	char pool[ZFS_MAX_DATASET_NAME_LEN];
465 	nvlist_t *args;
466 	nvpair_t *elem;
467 	int error;
468 
469 	/* determine the pool name */
470 	elem = nvlist_next_nvpair(holds, NULL);
471 	if (elem == NULL)
472 		return (0);
473 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
474 	pool[strcspn(pool, "/@")] = '\0';
475 
476 	args = fnvlist_alloc();
477 	fnvlist_add_nvlist(args, "holds", holds);
478 	if (cleanup_fd != -1)
479 		fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
480 
481 	error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
482 	nvlist_free(args);
483 	return (error);
484 }
485 
486 /*
487  * Release "user holds" on snapshots.  If the snapshot has been marked for
488  * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
489  * any clones, and all the user holds are removed, then the snapshot will be
490  * destroyed.
491  *
492  * The keys in the nvlist are snapshot names.
493  * The snapshots must all be in the same pool.
494  * The value is a nvlist whose keys are the holds to remove.
495  *
496  * Holds which failed to release because they didn't exist will have an entry
497  * added to errlist, but will not cause an overall failure.
498  *
499  * The return value will be 0 if the nvl holds was empty or all holds that
500  * existed, were successfully removed.
501  *
502  * Otherwise the return value will be the errno of a (unspecified) hold that
503  * failed to release and no holds will be released.
504  *
505  * In all cases the errlist will have an entry for each hold that failed to
506  * to release.
507  */
508 int
509 lzc_release(nvlist_t *holds, nvlist_t **errlist)
510 {
511 	char pool[ZFS_MAX_DATASET_NAME_LEN];
512 	nvpair_t *elem;
513 
514 	/* determine the pool name */
515 	elem = nvlist_next_nvpair(holds, NULL);
516 	if (elem == NULL)
517 		return (0);
518 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
519 	pool[strcspn(pool, "/@")] = '\0';
520 
521 	return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
522 }
523 
524 /*
525  * Retrieve list of user holds on the specified snapshot.
526  *
527  * On success, *holdsp will be set to a nvlist which the caller must free.
528  * The keys are the names of the holds, and the value is the creation time
529  * of the hold (uint64) in seconds since the epoch.
530  */
531 int
532 lzc_get_holds(const char *snapname, nvlist_t **holdsp)
533 {
534 	return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp));
535 }
536 
537 /*
538  * Generate a zfs send stream for the specified snapshot and write it to
539  * the specified file descriptor.
540  *
541  * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
542  *
543  * If "from" is NULL, a full (non-incremental) stream will be sent.
544  * If "from" is non-NULL, it must be the full name of a snapshot or
545  * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
546  * "pool/fs#earlier_bmark").  If non-NULL, the specified snapshot or
547  * bookmark must represent an earlier point in the history of "snapname").
548  * It can be an earlier snapshot in the same filesystem or zvol as "snapname",
549  * or it can be the origin of "snapname"'s filesystem, or an earlier
550  * snapshot in the origin, etc.
551  *
552  * "fd" is the file descriptor to write the send stream to.
553  *
554  * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
555  * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
556  * records with drr_blksz > 128K.
557  *
558  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
559  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
560  * which the receiving system must support (as indicated by support
561  * for the "embedded_data" feature).
562  */
563 int
564 lzc_send(const char *snapname, const char *from, int fd,
565     enum lzc_send_flags flags)
566 {
567 	return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
568 }
569 
570 int
571 lzc_send_resume(const char *snapname, const char *from, int fd,
572     enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
573 {
574 	nvlist_t *args;
575 	int err;
576 
577 	args = fnvlist_alloc();
578 	fnvlist_add_int32(args, "fd", fd);
579 	if (from != NULL)
580 		fnvlist_add_string(args, "fromsnap", from);
581 	if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
582 		fnvlist_add_boolean(args, "largeblockok");
583 	if (flags & LZC_SEND_FLAG_EMBED_DATA)
584 		fnvlist_add_boolean(args, "embedok");
585 	if (flags & LZC_SEND_FLAG_COMPRESS)
586 		fnvlist_add_boolean(args, "compressok");
587 	if (resumeobj != 0 || resumeoff != 0) {
588 		fnvlist_add_uint64(args, "resume_object", resumeobj);
589 		fnvlist_add_uint64(args, "resume_offset", resumeoff);
590 	}
591 	err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
592 	nvlist_free(args);
593 	return (err);
594 }
595 
596 /*
597  * "from" can be NULL, a snapshot, or a bookmark.
598  *
599  * If from is NULL, a full (non-incremental) stream will be estimated.  This
600  * is calculated very efficiently.
601  *
602  * If from is a snapshot, lzc_send_space uses the deadlists attached to
603  * each snapshot to efficiently estimate the stream size.
604  *
605  * If from is a bookmark, the indirect blocks in the destination snapshot
606  * are traversed, looking for blocks with a birth time since the creation TXG of
607  * the snapshot this bookmark was created from.  This will result in
608  * significantly more I/O and be less efficient than a send space estimation on
609  * an equivalent snapshot.
610  */
611 int
612 lzc_send_space(const char *snapname, const char *from,
613     enum lzc_send_flags flags, uint64_t *spacep)
614 {
615 	nvlist_t *args;
616 	nvlist_t *result;
617 	int err;
618 
619 	args = fnvlist_alloc();
620 	if (from != NULL)
621 		fnvlist_add_string(args, "from", from);
622 	if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
623 		fnvlist_add_boolean(args, "largeblockok");
624 	if (flags & LZC_SEND_FLAG_EMBED_DATA)
625 		fnvlist_add_boolean(args, "embedok");
626 	if (flags & LZC_SEND_FLAG_COMPRESS)
627 		fnvlist_add_boolean(args, "compressok");
628 	err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
629 	nvlist_free(args);
630 	if (err == 0)
631 		*spacep = fnvlist_lookup_uint64(result, "space");
632 	nvlist_free(result);
633 	return (err);
634 }
635 
/*
 * Read exactly 'ilen' bytes from 'fd' into 'buf', looping over short reads.
 *
 * Returns 0 once the whole buffer has been filled, or EIO if read(2) fails
 * or reaches end-of-file first.
 */
static int
recv_read(int fd, void *buf, int ilen)
{
	char *cp = buf;
	int len = ilen;

	while (len > 0) {
		ssize_t rv = read(fd, cp, len);

		/*
		 * Bail out before touching cp/len: advancing the pointer by
		 * a negative return value would step outside the buffer.
		 */
		if (rv <= 0)
			return (EIO);

		cp += rv;
		len -= (int)rv;
	}

	return (0);
}
654 
655 static int
656 recv_impl(const char *snapname, nvlist_t *props, const char *origin,
657     boolean_t force, boolean_t resumable, int fd,
658     const dmu_replay_record_t *begin_record)
659 {
660 	/*
661 	 * The receive ioctl is still legacy, so we need to construct our own
662 	 * zfs_cmd_t rather than using zfsc_ioctl().
663 	 */
664 	zfs_cmd_t zc = { 0 };
665 	char *atp;
666 	char *packed = NULL;
667 	size_t size;
668 	int error;
669 
670 	ASSERT3S(g_refcount, >, 0);
671 	VERIFY3S(g_fd, !=, -1);
672 
673 	/* zc_name is name of containing filesystem */
674 	(void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
675 	atp = strchr(zc.zc_name, '@');
676 	if (atp == NULL)
677 		return (EINVAL);
678 	*atp = '\0';
679 
680 	/* if the fs does not exist, try its parent. */
681 	if (!lzc_exists(zc.zc_name)) {
682 		char *slashp = strrchr(zc.zc_name, '/');
683 		if (slashp == NULL)
684 			return (ENOENT);
685 		*slashp = '\0';
686 
687 	}
688 
689 	/* zc_value is full name of the snapshot to create */
690 	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
691 
692 	if (props != NULL) {
693 		/* zc_nvlist_src is props to set */
694 		packed = fnvlist_pack(props, &size);
695 		zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
696 		zc.zc_nvlist_src_size = size;
697 	}
698 
699 	/* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
700 	if (origin != NULL)
701 		(void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
702 
703 	/* zc_begin_record is non-byteswapped BEGIN record */
704 	if (begin_record == NULL) {
705 		error = recv_read(fd, &zc.zc_begin_record,
706 		    sizeof (zc.zc_begin_record));
707 		if (error != 0)
708 			goto out;
709 	} else {
710 		zc.zc_begin_record = *begin_record;
711 	}
712 
713 	/* zc_cookie is fd to read from */
714 	zc.zc_cookie = fd;
715 
716 	/* zc guid is force flag */
717 	zc.zc_guid = force;
718 
719 	zc.zc_resumable = resumable;
720 
721 	/* zc_cleanup_fd is unused */
722 	zc.zc_cleanup_fd = -1;
723 
724 	error = ioctl(g_fd, ZFS_IOC_RECV, &zc);
725 	if (error != 0)
726 		error = errno;
727 
728 out:
729 	if (packed != NULL)
730 		fnvlist_pack_free(packed, size);
731 	free((void*)(uintptr_t)zc.zc_nvlist_dst);
732 	return (error);
733 }
734 
735 /*
736  * The simplest receive case: receive from the specified fd, creating the
737  * specified snapshot.  Apply the specified properties as "received" properties
738  * (which can be overridden by locally-set properties).  If the stream is a
739  * clone, its origin snapshot must be specified by 'origin'.  The 'force'
740  * flag will cause the target filesystem to be rolled back or destroyed if
741  * necessary to receive.
742  *
743  * Return 0 on success or an errno on failure.
744  *
745  * Note: this interface does not work on dedup'd streams
746  * (those with DMU_BACKUP_FEATURE_DEDUP).
747  */
748 int
749 lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
750     boolean_t force, int fd)
751 {
752 	return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL));
753 }
754 
755 /*
756  * Like lzc_receive, but if the receive fails due to premature stream
757  * termination, the intermediate state will be preserved on disk.  In this
758  * case, ECKSUM will be returned.  The receive may subsequently be resumed
759  * with a resuming send stream generated by lzc_send_resume().
760  */
761 int
762 lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
763     boolean_t force, int fd)
764 {
765 	return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL));
766 }
767 
768 /*
769  * Like lzc_receive, but allows the caller to read the begin record and then to
770  * pass it in.  That could be useful if the caller wants to derive, for example,
771  * the snapname or the origin parameters based on the information contained in
772  * the begin record.
773  * The begin record must be in its original form as read from the stream,
774  * in other words, it should not be byteswapped.
775  *
776  * The 'resumable' parameter allows to obtain the same behavior as with
777  * lzc_receive_resumable.
778  */
779 int
780 lzc_receive_with_header(const char *snapname, nvlist_t *props,
781     const char *origin, boolean_t force, boolean_t resumable, int fd,
782     const dmu_replay_record_t *begin_record)
783 {
784 	if (begin_record == NULL)
785 		return (EINVAL);
786 	return (recv_impl(snapname, props, origin, force, resumable, fd,
787 	    begin_record));
788 }
789 
790 /*
791  * Roll back this filesystem or volume to its most recent snapshot.
792  * If snapnamebuf is not NULL, it will be filled in with the name
793  * of the most recent snapshot.
794  * Note that the latest snapshot may change if a new one is concurrently
795  * created or the current one is destroyed.  lzc_rollback_to can be used
796  * to roll back to a specific latest snapshot.
797  *
798  * Return 0 on success or an errno on failure.
799  */
800 int
801 lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
802 {
803 	nvlist_t *args;
804 	nvlist_t *result;
805 	int err;
806 
807 	args = fnvlist_alloc();
808 	err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
809 	nvlist_free(args);
810 	if (err == 0 && snapnamebuf != NULL) {
811 		const char *snapname = fnvlist_lookup_string(result, "target");
812 		(void) strlcpy(snapnamebuf, snapname, snapnamelen);
813 	}
814 	nvlist_free(result);
815 
816 	return (err);
817 }
818 
819 /*
820  * Roll back this filesystem or volume to the specified snapshot,
821  * if possible.
822  *
823  * Return 0 on success or an errno on failure.
824  */
825 int
826 lzc_rollback_to(const char *fsname, const char *snapname)
827 {
828 	nvlist_t *args;
829 	nvlist_t *result;
830 	int err;
831 
832 	args = fnvlist_alloc();
833 	fnvlist_add_string(args, "target", snapname);
834 	err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
835 	nvlist_free(args);
836 	nvlist_free(result);
837 	return (err);
838 }
839 
840 /*
841  * Creates bookmarks.
842  *
843  * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
844  * the name of the snapshot (e.g. "pool/fs@snap").  All the bookmarks and
845  * snapshots must be in the same pool.
846  *
847  * The returned results nvlist will have an entry for each bookmark that failed.
848  * The value will be the (int32) error code.
849  *
850  * The return value will be 0 if all bookmarks were created, otherwise it will
851  * be the errno of a (undetermined) bookmarks that failed.
852  */
853 int
854 lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
855 {
856 	nvpair_t *elem;
857 	int error;
858 	char pool[ZFS_MAX_DATASET_NAME_LEN];
859 
860 	/* determine the pool name */
861 	elem = nvlist_next_nvpair(bookmarks, NULL);
862 	if (elem == NULL)
863 		return (0);
864 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
865 	pool[strcspn(pool, "/#")] = '\0';
866 
867 	error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist);
868 
869 	return (error);
870 }
871 
872 /*
873  * Retrieve bookmarks.
874  *
875  * Retrieve the list of bookmarks for the given file system. The props
876  * parameter is an nvlist of property names (with no values) that will be
877  * returned for each bookmark.
878  *
879  * The following are valid properties on bookmarks, all of which are numbers
880  * (represented as uint64 in the nvlist)
881  *
882  * "guid" - globally unique identifier of the snapshot it refers to
883  * "createtxg" - txg when the snapshot it refers to was created
884  * "creation" - timestamp when the snapshot it refers to was created
885  *
886  * The format of the returned nvlist as follows:
887  * <short name of bookmark> -> {
888  *     <name of property> -> {
889  *         "value" -> uint64
890  *     }
891  *  }
892  */
893 int
894 lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
895 {
896 	return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks));
897 }
898 
899 /*
900  * Destroys bookmarks.
901  *
902  * The keys in the bmarks nvlist are the bookmarks to be destroyed.
903  * They must all be in the same pool.  Bookmarks are specified as
904  * <fs>#<bmark>.
905  *
906  * Bookmarks that do not exist will be silently ignored.
907  *
908  * The return value will be 0 if all bookmarks that existed were destroyed.
909  *
910  * Otherwise the return value will be the errno of a (undetermined) bookmark
911  * that failed, no bookmarks will be destroyed, and the errlist will have an
912  * entry for each bookmarks that failed.  The value in the errlist will be
913  * the (int32) error code.
914  */
915 int
916 lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
917 {
918 	nvpair_t *elem;
919 	int error;
920 	char pool[ZFS_MAX_DATASET_NAME_LEN];
921 
922 	/* determine the pool name */
923 	elem = nvlist_next_nvpair(bmarks, NULL);
924 	if (elem == NULL)
925 		return (0);
926 	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
927 	pool[strcspn(pool, "/#")] = '\0';
928 
929 	error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist);
930 
931 	return (error);
932 }
933 
934 static int
935 lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync,
936     uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
937 {
938 	int error;
939 	nvlist_t *args;
940 
941 	args = fnvlist_alloc();
942 	fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
943 	fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
944 	fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync);
945 	fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
946 	fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
947 	error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
948 	fnvlist_free(args);
949 
950 	return (error);
951 }
952 
953 /*
954  * Executes a channel program.
955  *
956  * If this function returns 0 the channel program was successfully loaded and
957  * ran without failing. Note that individual commands the channel program ran
958  * may have failed and the channel program is responsible for reporting such
959  * errors through outnvl if they are important.
960  *
961  * This method may also return:
962  *
963  * EINVAL   The program contains syntax errors, or an invalid memory or time
964  *          limit was given. No part of the channel program was executed.
965  *          If caused by syntax errors, 'outnvl' contains information about the
966  *          errors.
967  *
968  * ECHRNG   The program was executed, but encountered a runtime error, such as
969  *          calling a function with incorrect arguments, invoking the error()
970  *          function directly, failing an assert() command, etc. Some portion
971  *          of the channel program may have executed and committed changes.
972  *          Information about the failure can be found in 'outnvl'.
973  *
974  * ENOMEM   The program fully executed, but the output buffer was not large
975  *          enough to store the returned value. No output is returned through
976  *          'outnvl'.
977  *
978  * ENOSPC   The program was terminated because it exceeded its memory usage
979  *          limit. Some portion of the channel program may have executed and
980  *          committed changes to disk. No output is returned through 'outnvl'.
981  *
982  * ETIME    The program was terminated because it exceeded its Lua instruction
983  *          limit. Some portion of the channel program may have executed and
984  *          committed changes to disk. No output is returned through 'outnvl'.
985  */
986 int
987 lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
988     uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
989 {
990 	return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit,
991 	    memlimit, argnvl, outnvl));
992 }
993 
994 /*
995  * Creates a checkpoint for the specified pool.
996  *
997  * If this function returns 0 the pool was successfully checkpointed.
998  *
999  * This method may also return:
1000  *
1001  * ZFS_ERR_CHECKPOINT_EXISTS
1002  *	The pool already has a checkpoint. A pools can only have one
1003  *	checkpoint at most, at any given time.
1004  *
1005  * ZFS_ERR_DISCARDING_CHECKPOINT
1006  * 	ZFS is in the middle of discarding a checkpoint for this pool.
1007  * 	The pool can be checkpointed again once the discard is done.
1008  *
1009  * ZFS_DEVRM_IN_PROGRESS
1010  * 	A vdev is currently being removed. The pool cannot be
1011  * 	checkpointed until the device removal is done.
1012  *
1013  * ZFS_VDEV_TOO_BIG
1014  * 	One or more top-level vdevs exceed the maximum vdev size
1015  * 	supported for this feature.
1016  */
1017 int
1018 lzc_pool_checkpoint(const char *pool)
1019 {
1020 	int error;
1021 
1022 	nvlist_t *result = NULL;
1023 	nvlist_t *args = fnvlist_alloc();
1024 
1025 	error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result);
1026 
1027 	fnvlist_free(args);
1028 	fnvlist_free(result);
1029 
1030 	return (error);
1031 }
1032 
1033 /*
1034  * Discard the checkpoint from the specified pool.
1035  *
1036  * If this function returns 0 the checkpoint was successfully discarded.
1037  *
1038  * This method may also return:
1039  *
1040  * ZFS_ERR_NO_CHECKPOINT
1041  * 	The pool does not have a checkpoint.
1042  *
1043  * ZFS_ERR_DISCARDING_CHECKPOINT
1044  * 	ZFS is already in the middle of discarding the checkpoint.
1045  */
1046 int
1047 lzc_pool_checkpoint_discard(const char *pool)
1048 {
1049 	int error;
1050 
1051 	nvlist_t *result = NULL;
1052 	nvlist_t *args = fnvlist_alloc();
1053 
1054 	error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result);
1055 
1056 	fnvlist_free(args);
1057 	fnvlist_free(result);
1058 
1059 	return (error);
1060 }
1061 
1062 /*
1063  * Executes a read-only channel program.
1064  *
1065  * A read-only channel program works programmatically the same way as a
1066  * normal channel program executed with lzc_channel_program(). The only
1067  * difference is it runs exclusively in open-context and therefore can
1068  * return faster. The downside to that, is that the program cannot change
1069  * on-disk state by calling functions from the zfs.sync submodule.
1070  *
1071  * The return values of this function (and their meaning) are exactly the
1072  * same as the ones described in lzc_channel_program().
1073  */
1074 int
1075 lzc_channel_program_nosync(const char *pool, const char *program,
1076     uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
1077 {
1078 	return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
1079 	    memlimit, argnvl, outnvl));
1080 }
1081 
1082 /*
1083  * Changes initializing state.
1084  *
1085  * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID.
1086  * The key is ignored.
1087  *
1088  * If there are errors related to vdev arguments, per-vdev errors are returned
1089  * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where
1090  * guid is stringified with PRIu64, and errno is one of the following as
1091  * an int64_t:
1092  *	- ENODEV if the device was not found
1093  *	- EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
1094  *	- EROFS if the device is not writeable
1095  *	- EBUSY start requested but the device is already being initialized
1096  *	- ESRCH cancel/suspend requested but device is not being initialized
1097  *
1098  * If the errlist is empty, then return value will be:
1099  *	- EINVAL if one or more arguments was invalid
1100  *	- Other spa_open failures
1101  *	- 0 if the operation succeeded
1102  */
1103 int
1104 lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
1105     nvlist_t *vdevs, nvlist_t **errlist)
1106 {
1107 	int error;
1108 	nvlist_t *args = fnvlist_alloc();
1109 	fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
1110 	fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
1111 
1112 	error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist);
1113 
1114 	fnvlist_free(args);
1115 
1116 	return (error);
1117 }
1118