1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 Cyril Plisko. All rights reserved. 24 * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/sysmacros.h> 30 #include <sys/cmn_err.h> 31 #include <sys/kmem.h> 32 #include <sys/thread.h> 33 #include <sys/file.h> 34 #include <sys/fcntl.h> 35 #include <sys/vfs.h> 36 #include <sys/fs/zfs.h> 37 #include <sys/zfs_znode.h> 38 #include <sys/zfs_dir.h> 39 #include <sys/zfs_acl.h> 40 #include <sys/zfs_fuid.h> 41 #include <sys/zfs_vnops.h> 42 #include <sys/spa.h> 43 #include <sys/zil.h> 44 #include <sys/byteorder.h> 45 #include <sys/stat.h> 46 #include <sys/acl.h> 47 #include <sys/atomic.h> 48 #include <sys/cred.h> 49 #include <sys/zpl.h> 50 #include <sys/dmu_objset.h> 51 #include <sys/zfeature.h> 52 53 /* 54 * NB: FreeBSD expects to be able to do vnode locking in lookup and 55 * hold the locks across all subsequent VOPs until vput is called. 56 * This means that its zfs vnops routines can't do any internal locking. 57 * In order to have the same contract as the Linux vnops there would 58 * needed to be duplicate locked vnops. If the vnops were used more widely 59 * in common code this would likely be preferable. However, currently 60 * this is the only file where this is the case. 61 */ 62 63 /* 64 * Functions to replay ZFS intent log (ZIL) records 65 * The functions are called through a function vector (zfs_replay_vector) 66 * which is indexed by the transaction type. 67 */ 68 69 static void 70 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, 71 uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) 72 { 73 memset(vap, 0, sizeof (*vap)); 74 vap->va_mask = (uint_t)mask; 75 vap->va_mode = mode; 76 #if defined(__FreeBSD__) || defined(__APPLE__) 77 vap->va_type = IFTOVT(mode); 78 #endif 79 vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; 80 vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid; 81 vap->va_rdev = zfs_cmpldev(rdev); 82 vap->va_nodeid = nodeid; 83 } 84 85 static int 86 zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) 87 { 88 (void) arg1, (void) arg2, (void) byteswap; 89 return (SET_ERROR(ENOTSUP)); 90 } 91 92 static void 93 zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) 94 { 95 xoptattr_t *xoap = NULL; 96 uint64_t *attrs; 97 uint64_t *crtime; 98 uint32_t *bitmap; 99 void *scanstamp; 100 int i; 101 102 xvap->xva_vattr.va_mask |= ATTR_XVATTR; 103 if ((xoap = xva_getxoptattr(xvap)) == NULL) { 104 xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */ 105 return; 106 } 107 108 ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); 109 110 bitmap = &lrattr->lr_attr_bitmap; 111 for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) 112 xvap->xva_reqattrmap[i] = *bitmap; 113 114 attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); 115 crtime = attrs + 1; 116 scanstamp = (caddr_t)(crtime + 2); 117 118 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) 119 xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); 120 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) 121 xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); 122 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) 123 xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); 124 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) 125 xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); 126 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) 127 xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); 128 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) 129 xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); 130 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) 131 xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); 132 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) 133 xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); 134 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) 135 xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); 136 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) 137 xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); 138 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) 139 xoap->xoa_av_quarantined = 140 ((*attrs & XAT0_AV_QUARANTINED) != 0); 141 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 142 ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); 143 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 144 ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); 145 146 memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); 147 } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 148 /* 149 * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid 150 * at the same time, so we can share the same space. 151 */ 152 memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); 153 } 154 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) 155 xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); 156 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) 157 xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); 158 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) 159 xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); 160 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) 161 xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0); 162 } 163 164 static int 165 zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) 166 { 167 uint64_t uid_idx; 168 uint64_t gid_idx; 169 int domcnt = 0; 170 171 uid_idx = FUID_INDEX(uid); 172 gid_idx = FUID_INDEX(gid); 173 if (uid_idx) 174 domcnt++; 175 if (gid_idx > 0 && gid_idx != uid_idx) 176 domcnt++; 177 178 return (domcnt); 179 } 180 181 static void * 182 zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, 183 int domcnt) 184 { 185 int i; 186 187 for (i = 0; i != domcnt; i++) { 188 fuid_infop->z_domain_table[i] = start; 189 start = (caddr_t)start + strlen(start) + 1; 190 } 191 192 return (start); 193 } 194 195 /* 196 * Set the uid/gid in the fuid_info structure. 197 */ 198 static void 199 zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) 200 { 201 /* 202 * If owner or group are log specific FUIDs then slurp up 203 * domain information and build zfs_fuid_info_t 204 */ 205 if (IS_EPHEMERAL(uid)) 206 fuid_infop->z_fuid_owner = uid; 207 208 if (IS_EPHEMERAL(gid)) 209 fuid_infop->z_fuid_group = gid; 210 } 211 212 /* 213 * Load fuid domains into fuid_info_t 214 */ 215 static zfs_fuid_info_t * 216 zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) 217 { 218 int domcnt; 219 220 zfs_fuid_info_t *fuid_infop; 221 222 fuid_infop = zfs_fuid_info_alloc(); 223 224 domcnt = zfs_replay_domain_cnt(uid, gid); 225 226 if (domcnt == 0) 227 return (fuid_infop); 228 229 fuid_infop->z_domain_table = 230 kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); 231 232 zfs_replay_fuid_ugid(fuid_infop, uid, gid); 233 234 fuid_infop->z_domain_cnt = domcnt; 235 *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); 236 return (fuid_infop); 237 } 238 239 /* 240 * load zfs_fuid_t's and fuid_domains into fuid_info_t 241 */ 242 static zfs_fuid_info_t * 243 zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, 244 uint64_t gid) 245 { 246 uint64_t *log_fuid = (uint64_t *)start; 247 zfs_fuid_info_t *fuid_infop; 248 int i; 249 250 fuid_infop = zfs_fuid_info_alloc(); 251 fuid_infop->z_domain_cnt = domcnt; 252 253 fuid_infop->z_domain_table = 254 kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); 255 256 for (i = 0; i != idcnt; i++) { 257 zfs_fuid_t *zfuid; 258 259 zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); 260 zfuid->z_logfuid = *log_fuid; 261 zfuid->z_id = -1; 262 zfuid->z_domidx = 0; 263 list_insert_tail(&fuid_infop->z_fuids, zfuid); 264 log_fuid++; 265 } 266 267 zfs_replay_fuid_ugid(fuid_infop, uid, gid); 268 269 *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); 270 return (fuid_infop); 271 } 272 273 static void 274 zfs_replay_swap_attrs(lr_attr_t *lrattr) 275 { 276 /* swap the lr_attr structure */ 277 byteswap_uint32_array(lrattr, sizeof (*lrattr)); 278 /* swap the bitmap */ 279 byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * 280 sizeof (uint32_t)); 281 /* swap the attributes, create time + 64 bit word for attributes */ 282 byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * 283 (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); 284 } 285 286 /* 287 * Replay file create with optional ACL, xvattr information as well 288 * as option FUID information. 289 */ 290 static int 291 zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) 292 { 293 zfsvfs_t *zfsvfs = arg1; 294 lr_acl_create_t *lracl = arg2; 295 char *name = NULL; /* location determined later */ 296 lr_create_t *lr = (lr_create_t *)lracl; 297 znode_t *dzp; 298 znode_t *zp; 299 xvattr_t xva; 300 int vflg = 0; 301 vsecattr_t vsec = { 0 }; 302 lr_attr_t *lrattr; 303 void *aclstart; 304 void *fuidstart; 305 size_t xvatlen = 0; 306 uint64_t txtype; 307 uint64_t objid; 308 uint64_t dnodesize; 309 int error; 310 311 txtype = (lr->lr_common.lrc_txtype & ~TX_CI); 312 if (byteswap) { 313 byteswap_uint64_array(lracl, sizeof (*lracl)); 314 if (txtype == TX_CREATE_ACL_ATTR || 315 txtype == TX_MKDIR_ACL_ATTR) { 316 lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); 317 zfs_replay_swap_attrs(lrattr); 318 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); 319 } 320 321 aclstart = (caddr_t)(lracl + 1) + xvatlen; 322 zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); 323 /* swap fuids */ 324 if (lracl->lr_fuidcnt) { 325 byteswap_uint64_array((caddr_t)aclstart + 326 ZIL_ACE_LENGTH(lracl->lr_acl_bytes), 327 lracl->lr_fuidcnt * sizeof (uint64_t)); 328 } 329 } 330 331 if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) 332 return (error); 333 334 objid = LR_FOID_GET_OBJ(lr->lr_foid); 335 dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; 336 337 xva_init(&xva); 338 zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, 339 lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); 340 341 /* 342 * All forms of zfs create (create, mkdir, mkxattrdir, symlink) 343 * eventually end up in zfs_mknode(), which assigns the object's 344 * creation time, generation number, and dnode size. The generic 345 * zfs_create() has no concept of these attributes, so we smuggle 346 * the values inside the vattr's otherwise unused va_ctime, 347 * va_nblocks, and va_fsid fields. 348 */ 349 ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); 350 xva.xva_vattr.va_nblocks = lr->lr_gen; 351 xva.xva_vattr.va_fsid = dnodesize; 352 353 error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); 354 if (error) 355 goto bail; 356 357 if (lr->lr_common.lrc_txtype & TX_CI) 358 vflg |= FIGNORECASE; 359 switch (txtype) { 360 case TX_CREATE_ACL: 361 aclstart = (caddr_t)(lracl + 1); 362 fuidstart = (caddr_t)aclstart + 363 ZIL_ACE_LENGTH(lracl->lr_acl_bytes); 364 zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, 365 (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, 366 lr->lr_uid, lr->lr_gid); 367 zfs_fallthrough; 368 case TX_CREATE_ACL_ATTR: 369 if (name == NULL) { 370 lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); 371 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); 372 xva.xva_vattr.va_mask |= ATTR_XVATTR; 373 zfs_replay_xvattr(lrattr, &xva); 374 } 375 vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; 376 vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; 377 vsec.vsa_aclcnt = lracl->lr_aclcnt; 378 vsec.vsa_aclentsz = lracl->lr_acl_bytes; 379 vsec.vsa_aclflags = lracl->lr_acl_flags; 380 if (zfsvfs->z_fuid_replay == NULL) { 381 fuidstart = (caddr_t)(lracl + 1) + xvatlen + 382 ZIL_ACE_LENGTH(lracl->lr_acl_bytes); 383 zfsvfs->z_fuid_replay = 384 zfs_replay_fuids(fuidstart, 385 (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, 386 lr->lr_uid, lr->lr_gid); 387 } 388 389 error = zfs_create(dzp, name, &xva.xva_vattr, 390 0, 0, &zp, kcred, vflg, &vsec); 391 break; 392 case TX_MKDIR_ACL: 393 aclstart = (caddr_t)(lracl + 1); 394 fuidstart = (caddr_t)aclstart + 395 ZIL_ACE_LENGTH(lracl->lr_acl_bytes); 396 zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, 397 (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, 398 lr->lr_uid, lr->lr_gid); 399 zfs_fallthrough; 400 case TX_MKDIR_ACL_ATTR: 401 if (name == NULL) { 402 lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); 403 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); 404 zfs_replay_xvattr(lrattr, &xva); 405 } 406 vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; 407 vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; 408 vsec.vsa_aclcnt = lracl->lr_aclcnt; 409 vsec.vsa_aclentsz = lracl->lr_acl_bytes; 410 vsec.vsa_aclflags = lracl->lr_acl_flags; 411 if (zfsvfs->z_fuid_replay == NULL) { 412 fuidstart = (caddr_t)(lracl + 1) + xvatlen + 413 ZIL_ACE_LENGTH(lracl->lr_acl_bytes); 414 zfsvfs->z_fuid_replay = 415 zfs_replay_fuids(fuidstart, 416 (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, 417 lr->lr_uid, lr->lr_gid); 418 } 419 error = zfs_mkdir(dzp, name, &xva.xva_vattr, 420 &zp, kcred, vflg, &vsec); 421 break; 422 default: 423 error = SET_ERROR(ENOTSUP); 424 } 425 426 bail: 427 if (error == 0 && zp != NULL) { 428 #ifdef __FreeBSD__ 429 VOP_UNLOCK1(ZTOV(zp)); 430 #endif 431 zrele(zp); 432 } 433 zrele(dzp); 434 435 if (zfsvfs->z_fuid_replay) 436 zfs_fuid_info_free(zfsvfs->z_fuid_replay); 437 zfsvfs->z_fuid_replay = NULL; 438 439 return (error); 440 } 441 442 static int 443 zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) 444 { 445 zfsvfs_t *zfsvfs = arg1; 446 lr_create_t *lr = arg2; 447 char *name = NULL; /* location determined later */ 448 char *link; /* symlink content follows name */ 449 znode_t *dzp; 450 znode_t *zp = NULL; 451 xvattr_t xva; 452 int vflg = 0; 453 size_t lrsize = sizeof (lr_create_t); 454 lr_attr_t *lrattr; 455 void *start; 456 size_t xvatlen; 457 uint64_t txtype; 458 uint64_t objid; 459 uint64_t dnodesize; 460 int error; 461 462 txtype = (lr->lr_common.lrc_txtype & ~TX_CI); 463 if (byteswap) { 464 byteswap_uint64_array(lr, sizeof (*lr)); 465 if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) 466 zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); 467 } 468 469 470 if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) 471 return (error); 472 473 objid = LR_FOID_GET_OBJ(lr->lr_foid); 474 dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; 475 476 xva_init(&xva); 477 zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, 478 lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); 479 480 /* 481 * All forms of zfs create (create, mkdir, mkxattrdir, symlink) 482 * eventually end up in zfs_mknode(), which assigns the object's 483 * creation time, generation number, and dnode slot count. The 484 * generic zfs_create() has no concept of these attributes, so 485 * we smuggle the values inside the vattr's otherwise unused 486 * va_ctime, va_nblocks, and va_fsid fields. 487 */ 488 ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); 489 xva.xva_vattr.va_nblocks = lr->lr_gen; 490 xva.xva_vattr.va_fsid = dnodesize; 491 492 error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); 493 if (error) 494 goto out; 495 496 if (lr->lr_common.lrc_txtype & TX_CI) 497 vflg |= FIGNORECASE; 498 499 /* 500 * Symlinks don't have fuid info, and CIFS never creates 501 * symlinks. 502 * 503 * The _ATTR versions will grab the fuid info in their subcases. 504 */ 505 if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && 506 (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && 507 (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { 508 start = (lr + 1); 509 zfsvfs->z_fuid_replay = 510 zfs_replay_fuid_domain(start, &start, 511 lr->lr_uid, lr->lr_gid); 512 } 513 514 switch (txtype) { 515 case TX_CREATE_ATTR: 516 lrattr = (lr_attr_t *)(caddr_t)(lr + 1); 517 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); 518 zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); 519 start = (caddr_t)(lr + 1) + xvatlen; 520 zfsvfs->z_fuid_replay = 521 zfs_replay_fuid_domain(start, &start, 522 lr->lr_uid, lr->lr_gid); 523 name = (char *)start; 524 zfs_fallthrough; 525 526 case TX_CREATE: 527 if (name == NULL) 528 name = (char *)start; 529 530 error = zfs_create(dzp, name, &xva.xva_vattr, 531 0, 0, &zp, kcred, vflg, NULL); 532 break; 533 case TX_MKDIR_ATTR: 534 lrattr = (lr_attr_t *)(caddr_t)(lr + 1); 535 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); 536 zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); 537 start = (caddr_t)(lr + 1) + xvatlen; 538 zfsvfs->z_fuid_replay = 539 zfs_replay_fuid_domain(start, &start, 540 lr->lr_uid, lr->lr_gid); 541 name = (char *)start; 542 zfs_fallthrough; 543 544 case TX_MKDIR: 545 if (name == NULL) 546 name = (char *)(lr + 1); 547 548 error = zfs_mkdir(dzp, name, &xva.xva_vattr, 549 &zp, kcred, vflg, NULL); 550 break; 551 case TX_MKXATTR: 552 error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); 553 break; 554 case TX_SYMLINK: 555 name = (char *)(lr + 1); 556 link = name + strlen(name) + 1; 557 error = zfs_symlink(dzp, name, &xva.xva_vattr, 558 link, &zp, kcred, vflg); 559 break; 560 default: 561 error = SET_ERROR(ENOTSUP); 562 } 563 564 out: 565 if (error == 0 && zp != NULL) { 566 #ifdef __FreeBSD__ 567 VOP_UNLOCK1(ZTOV(zp)); 568 #endif 569 zrele(zp); 570 } 571 zrele(dzp); 572 573 if (zfsvfs->z_fuid_replay) 574 zfs_fuid_info_free(zfsvfs->z_fuid_replay); 575 zfsvfs->z_fuid_replay = NULL; 576 return (error); 577 } 578 579 static int 580 zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 581 { 582 zfsvfs_t *zfsvfs = arg1; 583 lr_remove_t *lr = arg2; 584 char *name = (char *)(lr + 1); /* name follows lr_remove_t */ 585 znode_t *dzp; 586 int error; 587 int vflg = 0; 588 589 if (byteswap) 590 byteswap_uint64_array(lr, sizeof (*lr)); 591 592 if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) 593 return (error); 594 595 if (lr->lr_common.lrc_txtype & TX_CI) 596 vflg |= FIGNORECASE; 597 598 switch ((int)lr->lr_common.lrc_txtype) { 599 case TX_REMOVE: 600 error = zfs_remove(dzp, name, kcred, vflg); 601 break; 602 case TX_RMDIR: 603 error = zfs_rmdir(dzp, name, NULL, kcred, vflg); 604 break; 605 default: 606 error = SET_ERROR(ENOTSUP); 607 } 608 609 zrele(dzp); 610 611 return (error); 612 } 613 614 static int 615 zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) 616 { 617 zfsvfs_t *zfsvfs = arg1; 618 lr_link_t *lr = arg2; 619 char *name = (char *)(lr + 1); /* name follows lr_link_t */ 620 znode_t *dzp, *zp; 621 int error; 622 int vflg = 0; 623 624 if (byteswap) 625 byteswap_uint64_array(lr, sizeof (*lr)); 626 627 if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) 628 return (error); 629 630 if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { 631 zrele(dzp); 632 return (error); 633 } 634 635 if (lr->lr_common.lrc_txtype & TX_CI) 636 vflg |= FIGNORECASE; 637 638 error = zfs_link(dzp, zp, name, kcred, vflg); 639 zrele(zp); 640 zrele(dzp); 641 642 return (error); 643 } 644 645 static int 646 zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) 647 { 648 zfsvfs_t *zfsvfs = arg1; 649 lr_rename_t *lr = arg2; 650 char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ 651 char *tname = sname + strlen(sname) + 1; 652 znode_t *sdzp, *tdzp; 653 int error; 654 int vflg = 0; 655 656 if (byteswap) 657 byteswap_uint64_array(lr, sizeof (*lr)); 658 659 if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) 660 return (error); 661 662 if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { 663 zrele(sdzp); 664 return (error); 665 } 666 667 if (lr->lr_common.lrc_txtype & TX_CI) 668 vflg |= FIGNORECASE; 669 670 error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg); 671 672 zrele(tdzp); 673 zrele(sdzp); 674 return (error); 675 } 676 677 static int 678 zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) 679 { 680 zfsvfs_t *zfsvfs = arg1; 681 lr_write_t *lr = arg2; 682 char *data = (char *)(lr + 1); /* data follows lr_write_t */ 683 znode_t *zp; 684 int error; 685 uint64_t eod, offset, length; 686 687 if (byteswap) 688 byteswap_uint64_array(lr, sizeof (*lr)); 689 690 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { 691 /* 692 * As we can log writes out of order, it's possible the 693 * file has been removed. In this case just drop the write 694 * and return success. 695 */ 696 if (error == ENOENT) 697 error = 0; 698 return (error); 699 } 700 701 offset = lr->lr_offset; 702 length = lr->lr_length; 703 eod = offset + length; /* end of data for this write */ 704 705 /* 706 * This may be a write from a dmu_sync() for a whole block, 707 * and may extend beyond the current end of the file. 708 * We can't just replay what was written for this TX_WRITE as 709 * a future TX_WRITE2 may extend the eof and the data for that 710 * write needs to be there. So we write the whole block and 711 * reduce the eof. This needs to be done within the single dmu 712 * transaction created within vn_rdwr -> zfs_write. So a possible 713 * new end of file is passed through in zfsvfs->z_replay_eof 714 */ 715 716 zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ 717 718 /* If it's a dmu_sync() block, write the whole block */ 719 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 720 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 721 if (length < blocksize) { 722 offset -= offset % blocksize; 723 length = blocksize; 724 } 725 if (zp->z_size < eod) 726 zfsvfs->z_replay_eof = eod; 727 } 728 error = zfs_write_simple(zp, data, length, offset, NULL); 729 zrele(zp); 730 zfsvfs->z_replay_eof = 0; /* safety */ 731 732 return (error); 733 } 734 735 /* 736 * TX_WRITE2 are only generated when dmu_sync() returns EALREADY 737 * meaning the pool block is already being synced. So now that we always write 738 * out full blocks, all we have to do is expand the eof if 739 * the file is grown. 740 */ 741 static int 742 zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) 743 { 744 zfsvfs_t *zfsvfs = arg1; 745 lr_write_t *lr = arg2; 746 znode_t *zp; 747 int error; 748 uint64_t end; 749 750 if (byteswap) 751 byteswap_uint64_array(lr, sizeof (*lr)); 752 753 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) 754 return (error); 755 756 top: 757 end = lr->lr_offset + lr->lr_length; 758 if (end > zp->z_size) { 759 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 760 761 zp->z_size = end; 762 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 763 error = dmu_tx_assign(tx, TXG_WAIT); 764 if (error) { 765 zrele(zp); 766 if (error == ERESTART) { 767 dmu_tx_wait(tx); 768 dmu_tx_abort(tx); 769 goto top; 770 } 771 dmu_tx_abort(tx); 772 return (error); 773 } 774 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 775 (void *)&zp->z_size, sizeof (uint64_t), tx); 776 777 /* Ensure the replayed seq is updated */ 778 (void) zil_replaying(zfsvfs->z_log, tx); 779 780 dmu_tx_commit(tx); 781 } 782 783 zrele(zp); 784 785 return (error); 786 } 787 788 static int 789 zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 790 { 791 zfsvfs_t *zfsvfs = arg1; 792 lr_truncate_t *lr = arg2; 793 znode_t *zp; 794 flock64_t fl = {0}; 795 int error; 796 797 if (byteswap) 798 byteswap_uint64_array(lr, sizeof (*lr)); 799 800 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) 801 return (error); 802 803 fl.l_type = F_WRLCK; 804 fl.l_whence = SEEK_SET; 805 fl.l_start = lr->lr_offset; 806 fl.l_len = lr->lr_length; 807 808 error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE, 809 lr->lr_offset, kcred); 810 811 zrele(zp); 812 813 return (error); 814 } 815 816 static int 817 zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 818 { 819 zfsvfs_t *zfsvfs = arg1; 820 lr_setattr_t *lr = arg2; 821 znode_t *zp; 822 xvattr_t xva; 823 vattr_t *vap = &xva.xva_vattr; 824 int error; 825 void *start; 826 827 xva_init(&xva); 828 if (byteswap) { 829 byteswap_uint64_array(lr, sizeof (*lr)); 830 831 if ((lr->lr_mask & ATTR_XVATTR) && 832 zfsvfs->z_version >= ZPL_VERSION_INITIAL) 833 zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); 834 } 835 836 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) 837 return (error); 838 839 zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, 840 lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); 841 842 vap->va_size = lr->lr_size; 843 ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); 844 ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); 845 gethrestime(&vap->va_ctime); 846 vap->va_mask |= ATTR_CTIME; 847 848 /* 849 * Fill in xvattr_t portions if necessary. 850 */ 851 852 start = (lr_setattr_t *)(lr + 1); 853 if (vap->va_mask & ATTR_XVATTR) { 854 zfs_replay_xvattr((lr_attr_t *)start, &xva); 855 start = (caddr_t)start + 856 ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); 857 } else 858 xva.xva_vattr.va_mask &= ~ATTR_XVATTR; 859 860 zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, 861 lr->lr_uid, lr->lr_gid); 862 863 error = zfs_setattr(zp, vap, 0, kcred); 864 865 zfs_fuid_info_free(zfsvfs->z_fuid_replay); 866 zfsvfs->z_fuid_replay = NULL; 867 zrele(zp); 868 869 return (error); 870 } 871 872 static int 873 zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) 874 { 875 zfsvfs_t *zfsvfs = arg1; 876 lr_setsaxattr_t *lr = arg2; 877 znode_t *zp; 878 nvlist_t *nvl; 879 size_t sa_size; 880 char *name; 881 char *value; 882 size_t size; 883 int error = 0; 884 885 ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, 886 SPA_FEATURE_ZILSAXATTR)); 887 if (byteswap) 888 byteswap_uint64_array(lr, sizeof (*lr)); 889 890 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) 891 return (error); 892 893 rw_enter(&zp->z_xattr_lock, RW_WRITER); 894 mutex_enter(&zp->z_lock); 895 if (zp->z_xattr_cached == NULL) 896 error = zfs_sa_get_xattr(zp); 897 mutex_exit(&zp->z_lock); 898 899 if (error) 900 goto out; 901 902 ASSERT(zp->z_xattr_cached); 903 nvl = zp->z_xattr_cached; 904 905 /* Get xattr name, value and size from log record */ 906 size = lr->lr_size; 907 name = (char *)(lr + 1); 908 if (size == 0) { 909 value = NULL; 910 error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); 911 } else { 912 value = name + strlen(name) + 1; 913 /* Limited to 32k to keep nvpair memory allocations small */ 914 if (size > DXATTR_MAX_ENTRY_SIZE) { 915 error = SET_ERROR(EFBIG); 916 goto out; 917 } 918 919 /* Prevent the DXATTR SA from consuming the entire SA region */ 920 error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); 921 if (error) 922 goto out; 923 924 if (sa_size > DXATTR_MAX_SA_SIZE) { 925 error = SET_ERROR(EFBIG); 926 goto out; 927 } 928 929 error = nvlist_add_byte_array(nvl, name, (uchar_t *)value, 930 size); 931 } 932 933 /* 934 * Update the SA for additions, modifications, and removals. On 935 * error drop the inconsistent cached version of the nvlist, it 936 * will be reconstructed from the ARC when next accessed. 937 */ 938 if (error == 0) 939 error = zfs_sa_set_xattr(zp, name, value, size); 940 941 if (error) { 942 nvlist_free(nvl); 943 zp->z_xattr_cached = NULL; 944 } 945 946 out: 947 rw_exit(&zp->z_xattr_lock); 948 zrele(zp); 949 return (error); 950 } 951 952 static int 953 zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) 954 { 955 zfsvfs_t *zfsvfs = arg1; 956 lr_acl_v0_t *lr = arg2; 957 ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ 958 vsecattr_t vsa = {0}; 959 znode_t *zp; 960 int error; 961 962 if (byteswap) { 963 byteswap_uint64_array(lr, sizeof (*lr)); 964 zfs_oldace_byteswap(ace, lr->lr_aclcnt); 965 } 966 967 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) 968 return (error); 969 970 vsa.vsa_mask = VSA_ACE | VSA_ACECNT; 971 vsa.vsa_aclcnt = lr->lr_aclcnt; 972 vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; 973 vsa.vsa_aclflags = 0; 974 vsa.vsa_aclentp = ace; 975 976 error = zfs_setsecattr(zp, &vsa, 0, kcred); 977 978 zrele(zp); 979 980 return (error); 981 } 982 983 /* 984 * Replaying ACLs is complicated by FUID support. 985 * The log record may contain some optional data 986 * to be used for replaying FUID's. These pieces 987 * are the actual FUIDs that were created initially. 988 * The FUID table index may no longer be valid and 989 * during zfs_create() a new index may be assigned. 990 * Because of this the log will contain the original 991 * domain+rid in order to create a new FUID. 992 * 993 * The individual ACEs may contain an ephemeral uid/gid which is no 994 * longer valid and will need to be replaced with an actual FUID. 995 * 996 */ 997 static int 998 zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) 999 { 1000 zfsvfs_t *zfsvfs = arg1; 1001 lr_acl_t *lr = arg2; 1002 ace_t *ace = (ace_t *)(lr + 1); 1003 vsecattr_t vsa = {0}; 1004 znode_t *zp; 1005 int error; 1006 1007 if (byteswap) { 1008 byteswap_uint64_array(lr, sizeof (*lr)); 1009 zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); 1010 if (lr->lr_fuidcnt) { 1011 byteswap_uint64_array((caddr_t)ace + 1012 ZIL_ACE_LENGTH(lr->lr_acl_bytes), 1013 lr->lr_fuidcnt * sizeof (uint64_t)); 1014 } 1015 } 1016 1017 if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) 1018 return (error); 1019 1020 vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; 1021 vsa.vsa_aclcnt = lr->lr_aclcnt; 1022 vsa.vsa_aclentp = ace; 1023 vsa.vsa_aclentsz = lr->lr_acl_bytes; 1024 vsa.vsa_aclflags = lr->lr_acl_flags; 1025 1026 if (lr->lr_fuidcnt) { 1027 void *fuidstart = (caddr_t)ace + 1028 ZIL_ACE_LENGTH(lr->lr_acl_bytes); 1029 1030 zfsvfs->z_fuid_replay = 1031 zfs_replay_fuids(fuidstart, &fuidstart, 1032 lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); 1033 } 1034 1035 error = zfs_setsecattr(zp, &vsa, 0, kcred); 1036 1037 if (zfsvfs->z_fuid_replay) 1038 zfs_fuid_info_free(zfsvfs->z_fuid_replay); 1039 1040 zfsvfs->z_fuid_replay = NULL; 1041 zrele(zp); 1042 1043 return (error); 1044 } 1045 1046 /* 1047 * Callback vectors for replaying records 1048 */ 1049 zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { 1050 zfs_replay_error, /* no such type */ 1051 zfs_replay_create, /* TX_CREATE */ 1052 zfs_replay_create, /* TX_MKDIR */ 1053 zfs_replay_create, /* TX_MKXATTR */ 1054 zfs_replay_create, /* TX_SYMLINK */ 1055 zfs_replay_remove, /* TX_REMOVE */ 1056 zfs_replay_remove, /* TX_RMDIR */ 1057 zfs_replay_link, /* TX_LINK */ 1058 zfs_replay_rename, /* TX_RENAME */ 1059 zfs_replay_write, /* TX_WRITE */ 1060 zfs_replay_truncate, /* TX_TRUNCATE */ 1061 zfs_replay_setattr, /* TX_SETATTR */ 1062 zfs_replay_acl_v0, /* TX_ACL_V0 */ 1063 zfs_replay_acl, /* TX_ACL */ 1064 zfs_replay_create_acl, /* TX_CREATE_ACL */ 1065 zfs_replay_create, /* TX_CREATE_ATTR */ 1066 zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ 1067 zfs_replay_create_acl, /* TX_MKDIR_ACL */ 1068 zfs_replay_create, /* TX_MKDIR_ATTR */ 1069 zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ 1070 zfs_replay_write2, /* TX_WRITE2 */ 1071 zfs_replay_setsaxattr, /* TX_SETSAXATTR */ 1072 }; 1073