1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2015, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2022 by Pawel Jakub Dawidek 26 */ 27 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/cmn_err.h> 33 #include <sys/kmem.h> 34 #include <sys/thread.h> 35 #include <sys/file.h> 36 #include <sys/vfs.h> 37 #include <sys/zfs_znode.h> 38 #include <sys/zfs_dir.h> 39 #include <sys/zil.h> 40 #include <sys/zil_impl.h> 41 #include <sys/byteorder.h> 42 #include <sys/policy.h> 43 #include <sys/stat.h> 44 #include <sys/acl.h> 45 #include <sys/dmu.h> 46 #include <sys/dbuf.h> 47 #include <sys/spa.h> 48 #include <sys/zfs_fuid.h> 49 #include <sys/dsl_dataset.h> 50 51 /* 52 * These zfs_log_* functions must be called within a dmu tx, in one 53 * of 2 contexts depending on zilog->z_replay: 54 * 55 * Non replay mode 56 * --------------- 57 * We need to record the transaction so that if it is committed to 58 * the Intent Log then it can be replayed. 
An intent log transaction
 * structure (itx_t) is allocated and all the information necessary to
 * possibly replay the transaction is saved in it. The itx is then assigned
 * a sequence number and inserted in the in-memory list anchored in the zilog.
 *
 * Replay mode
 * -----------
 * We need to mark the intent log record as replayed in the log header.
 * This is done in the same transaction as the replay so that they
 * commit atomically.
 */

/*
 * Map a create-style operation onto the ZIL transaction type to log,
 * based on whether an ACL (vsecp) and/or extended system attributes
 * (ATTR_XVATTR in vap->va_mask) accompany the create.
 */
int
zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
{
	const boolean_t hasacl = (vsecp != NULL);
	const boolean_t hasxvattr = ((vap->va_mask & ATTR_XVATTR) != 0);

	switch (type) {
	case Z_FILE:
		if (hasacl && hasxvattr)
			return (TX_CREATE_ACL_ATTR);
		if (hasacl)
			return (TX_CREATE_ACL);
		if (hasxvattr)
			return (TX_CREATE_ATTR);
		return (TX_CREATE);
	case Z_DIR:
		if (hasacl && hasxvattr)
			return (TX_MKDIR_ACL_ATTR);
		if (hasacl)
			return (TX_MKDIR_ACL);
		if (hasxvattr)
			return (TX_MKDIR_ATTR);
		return (TX_MKDIR);
	case Z_XATTRDIR:
		return (TX_MKXATTR);
	}
	/* Unknown zil_create_t value; should be unreachable. */
	ASSERT(0);
	return (TX_MAX_TYPE);
}

/*
 * build up the log data necessary for logging xvattr_t
 * First lr_attr_t is initialized. following the lr_attr_t
 * is the mapsize and attribute bitmap copied from the xvattr_t.
 * Following the bitmap and bitmapsize two 64 bit words are reserved
 * for the create time which may be set. Following the create time
 * records a single 64 bit integer which has the bits to set on
 * replay for the xvattr.
108 */ 109 static void 110 zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) 111 { 112 xoptattr_t *xoap; 113 114 xoap = xva_getxoptattr(xvap); 115 ASSERT(xoap); 116 117 lrattr->lr_attr_masksize = xvap->xva_mapsize; 118 uint32_t *bitmap = &lrattr->lr_attr_bitmap; 119 for (int i = 0; i != xvap->xva_mapsize; i++, bitmap++) 120 *bitmap = xvap->xva_reqattrmap[i]; 121 122 lr_attr_end_t *end = (lr_attr_end_t *)bitmap; 123 end->lr_attr_attrs = 0; 124 end->lr_attr_crtime[0] = 0; 125 end->lr_attr_crtime[1] = 0; 126 memset(end->lr_attr_scanstamp, 0, AV_SCANSTAMP_SZ); 127 128 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) 129 end->lr_attr_attrs |= (xoap->xoa_readonly == 0) ? 0 : 130 XAT0_READONLY; 131 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) 132 end->lr_attr_attrs |= (xoap->xoa_hidden == 0) ? 0 : 133 XAT0_HIDDEN; 134 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) 135 end->lr_attr_attrs |= (xoap->xoa_system == 0) ? 0 : 136 XAT0_SYSTEM; 137 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) 138 end->lr_attr_attrs |= (xoap->xoa_archive == 0) ? 0 : 139 XAT0_ARCHIVE; 140 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) 141 end->lr_attr_attrs |= (xoap->xoa_immutable == 0) ? 0 : 142 XAT0_IMMUTABLE; 143 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) 144 end->lr_attr_attrs |= (xoap->xoa_nounlink == 0) ? 0 : 145 XAT0_NOUNLINK; 146 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) 147 end->lr_attr_attrs |= (xoap->xoa_appendonly == 0) ? 0 : 148 XAT0_APPENDONLY; 149 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) 150 end->lr_attr_attrs |= (xoap->xoa_opaque == 0) ? 0 : 151 XAT0_APPENDONLY; 152 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) 153 end->lr_attr_attrs |= (xoap->xoa_nodump == 0) ? 0 : 154 XAT0_NODUMP; 155 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) 156 end->lr_attr_attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : 157 XAT0_AV_QUARANTINED; 158 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) 159 end->lr_attr_attrs |= (xoap->xoa_av_modified == 0) ? 
0 : 160 XAT0_AV_MODIFIED; 161 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 162 ZFS_TIME_ENCODE(&xoap->xoa_createtime, end->lr_attr_crtime); 163 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 164 ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); 165 166 memcpy(end->lr_attr_scanstamp, xoap->xoa_av_scanstamp, 167 AV_SCANSTAMP_SZ); 168 } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 169 /* 170 * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid 171 * at the same time, so we can share the same space. 172 */ 173 memcpy(end->lr_attr_scanstamp, &xoap->xoa_projid, 174 sizeof (uint64_t)); 175 } 176 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) 177 end->lr_attr_attrs |= (xoap->xoa_reparse == 0) ? 0 : 178 XAT0_REPARSE; 179 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) 180 end->lr_attr_attrs |= (xoap->xoa_offline == 0) ? 0 : 181 XAT0_OFFLINE; 182 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) 183 end->lr_attr_attrs |= (xoap->xoa_sparse == 0) ? 0 : 184 XAT0_SPARSE; 185 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) 186 end->lr_attr_attrs |= (xoap->xoa_projinherit == 0) ? 0 : 187 XAT0_PROJINHERIT; 188 } 189 190 static void * 191 zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) 192 { 193 zfs_fuid_t *zfuid; 194 uint64_t *fuidloc = start; 195 196 /* First copy in the ACE FUIDs */ 197 for (zfuid = list_head(&fuidp->z_fuids); zfuid; 198 zfuid = list_next(&fuidp->z_fuids, zfuid)) { 199 *fuidloc++ = zfuid->z_logfuid; 200 } 201 return (fuidloc); 202 } 203 204 205 static void * 206 zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) 207 { 208 zfs_fuid_domain_t *zdomain; 209 210 /* now copy in the domain info, if any */ 211 if (fuidp->z_domain_str_sz != 0) { 212 for (zdomain = list_head(&fuidp->z_domains); zdomain; 213 zdomain = list_next(&fuidp->z_domains, zdomain)) { 214 memcpy(start, zdomain->z_domain, 215 strlen(zdomain->z_domain) + 1); 216 start = (caddr_t)start + 217 strlen(zdomain->z_domain) + 1; 218 } 219 } 220 return (start); 221 } 222 223 /* 224 * If zp is an xattr node, check whether the xattr owner is unlinked. 
 * We don't want to log anything if the owner is unlinked.
 */
static int
zfs_xattr_owner_unlinked(znode_t *zp)
{
	int unlinked = 0;
	znode_t *dzp;
#ifdef __FreeBSD__
	znode_t *tzp = zp;

	/*
	 * zrele drops the vnode lock which violates the VOP locking contract
	 * on FreeBSD. See comment at the top of zfs_replay.c for more detail.
	 */
	/*
	 * if zp is XATTR node, keep walking up via z_xattr_parent until we
	 * get the owner
	 */
	while (tzp->z_pflags & ZFS_XATTR) {
		ASSERT3U(zp->z_xattr_parent, !=, 0);
		/* Failure to look up the parent is treated as "unlinked". */
		if (zfs_zget(ZTOZSB(tzp), tzp->z_xattr_parent, &dzp) != 0) {
			unlinked = 1;
			break;
		}

		/* Release every intermediate node except the caller's zp. */
		if (tzp != zp)
			zrele(tzp);
		tzp = dzp;
		unlinked = tzp->z_unlinked;
	}
	if (tzp != zp)
		zrele(tzp);
#else
	/* Take an extra hold so the loop can uniformly zrele each step. */
	zhold(zp);
	/*
	 * if zp is XATTR node, keep walking up via z_xattr_parent until we
	 * get the owner
	 */
	while (zp->z_pflags & ZFS_XATTR) {
		ASSERT3U(zp->z_xattr_parent, !=, 0);
		/* Failure to look up the parent is treated as "unlinked". */
		if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) {
			unlinked = 1;
			break;
		}

		zrele(zp);
		zp = dzp;
		unlinked = zp->z_unlinked;
	}
	zrele(zp);
#endif
	return (unlinked);
}

/*
 * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
 * TX_MKXATTR transactions.
 *
 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
 * domain information appended prior to the name. In this case the
 * uid/gid in the log record will be a log centric FUID.
 *
 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
 * may contain attributes, ACL and optional fuid information.
 *
 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
 * an ACL and normal users/groups in the ACEs.
 *
 * There may be an optional xvattr attribute information similar
 * to zfs_log_setattr.
 *
 * Also, after the file name "domain" strings may be appended.
297 */ 298 void 299 zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 300 znode_t *dzp, znode_t *zp, const char *name, vsecattr_t *vsecp, 301 zfs_fuid_info_t *fuidp, vattr_t *vap) 302 { 303 itx_t *itx; 304 _lr_create_t *lr; 305 lr_acl_create_t *lracl = NULL; 306 uint8_t *lrdata; 307 size_t aclsize = 0; 308 size_t xvatsize = 0; 309 size_t txsize; 310 xvattr_t *xvap = (xvattr_t *)vap; 311 size_t namesize = strlen(name) + 1; 312 size_t fuidsz = 0; 313 314 if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) 315 return; 316 317 /* 318 * If we have FUIDs present then add in space for 319 * domains and ACE fuid's if any. 320 */ 321 if (fuidp) { 322 fuidsz += fuidp->z_domain_str_sz; 323 fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); 324 } 325 326 if (vap->va_mask & ATTR_XVATTR) 327 xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); 328 329 if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || 330 (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || 331 (int)txtype == TX_MKXATTR) { 332 txsize = sizeof (lr_create_t) + namesize + fuidsz + xvatsize; 333 itx = zil_itx_create(txtype, txsize); 334 lr_create_t *lrc = (lr_create_t *)&itx->itx_lr; 335 lrdata = &lrc->lr_data[0]; 336 } else { 337 txsize = 338 sizeof (lr_acl_create_t) + namesize + fuidsz + 339 ZIL_ACE_LENGTH(aclsize) + xvatsize; 340 itx = zil_itx_create(txtype, txsize); 341 lracl = (lr_acl_create_t *)&itx->itx_lr; 342 lrdata = &lracl->lr_data[0]; 343 } 344 345 346 lr = (_lr_create_t *)&itx->itx_lr; 347 lr->lr_doid = dzp->z_id; 348 lr->lr_foid = zp->z_id; 349 /* Store dnode slot count in 8 bits above object id. 
*/ 350 LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); 351 lr->lr_mode = zp->z_mode; 352 if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp)))) { 353 lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOUID(zp)); 354 } else { 355 lr->lr_uid = fuidp->z_fuid_owner; 356 } 357 if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp)))) { 358 lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOGID(zp)); 359 } else { 360 lr->lr_gid = fuidp->z_fuid_group; 361 } 362 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, 363 sizeof (uint64_t)); 364 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 365 lr->lr_crtime, sizeof (uint64_t) * 2); 366 367 if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev, 368 sizeof (lr->lr_rdev)) != 0) 369 lr->lr_rdev = 0; 370 371 /* 372 * Fill in xvattr info if any 373 */ 374 if (vap->va_mask & ATTR_XVATTR) { 375 zfs_log_xvattr((lr_attr_t *)lrdata, xvap); 376 lrdata = &lrdata[xvatsize]; 377 } 378 379 /* Now fill in any ACL info */ 380 381 if (vsecp) { 382 ASSERT3P(lracl, !=, NULL); 383 lracl->lr_aclcnt = vsecp->vsa_aclcnt; 384 lracl->lr_acl_bytes = aclsize; 385 lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; 386 lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; 387 if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) 388 lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; 389 else 390 lracl->lr_acl_flags = 0; 391 392 memcpy(lrdata, vsecp->vsa_aclentp, aclsize); 393 lrdata = &lrdata[ZIL_ACE_LENGTH(aclsize)]; 394 } 395 396 /* drop in FUID info */ 397 if (fuidp) { 398 lrdata = zfs_log_fuid_ids(fuidp, lrdata); 399 lrdata = zfs_log_fuid_domains(fuidp, lrdata); 400 } 401 /* 402 * Now place file name in log record 403 */ 404 memcpy(lrdata, name, namesize); 405 406 zil_itx_assign(zilog, itx, tx); 407 } 408 409 /* 410 * Handles both TX_REMOVE and TX_RMDIR transactions. 
411 */ 412 void 413 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 414 znode_t *dzp, const char *name, uint64_t foid, boolean_t unlinked) 415 { 416 itx_t *itx; 417 lr_remove_t *lr; 418 size_t namesize = strlen(name) + 1; 419 420 if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) 421 return; 422 423 itx = zil_itx_create(txtype, sizeof (*lr) + namesize); 424 lr = (lr_remove_t *)&itx->itx_lr; 425 lr->lr_doid = dzp->z_id; 426 memcpy(&lr->lr_data[0], name, namesize); 427 428 itx->itx_oid = foid; 429 430 /* 431 * Object ids can be re-instantiated in the next txg so 432 * remove any async transactions to avoid future leaks. 433 * This can happen if a fsync occurs on the re-instantiated 434 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 435 * the new file data and flushes a write record for the old object. 436 */ 437 if (unlinked) { 438 ASSERT((txtype & ~TX_CI) == TX_REMOVE); 439 zil_remove_async(zilog, foid); 440 } 441 zil_itx_assign(zilog, itx, tx); 442 } 443 444 /* 445 * Handles TX_LINK transactions. 446 */ 447 void 448 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 449 znode_t *dzp, znode_t *zp, const char *name) 450 { 451 itx_t *itx; 452 lr_link_t *lr; 453 size_t namesize = strlen(name) + 1; 454 455 if (zil_replaying(zilog, tx)) 456 return; 457 458 itx = zil_itx_create(txtype, sizeof (*lr) + namesize); 459 lr = (lr_link_t *)&itx->itx_lr; 460 lr->lr_doid = dzp->z_id; 461 lr->lr_link_obj = zp->z_id; 462 memcpy(&lr->lr_data[0], name, namesize); 463 464 zil_itx_assign(zilog, itx, tx); 465 } 466 467 /* 468 * Handles TX_SYMLINK transactions. 
469 */ 470 void 471 zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 472 znode_t *dzp, znode_t *zp, const char *name, const char *link) 473 { 474 itx_t *itx; 475 _lr_create_t *lr; 476 lr_create_t *lrc; 477 size_t namesize = strlen(name) + 1; 478 size_t linksize = strlen(link) + 1; 479 480 if (zil_replaying(zilog, tx)) 481 return; 482 483 itx = zil_itx_create(txtype, sizeof (*lrc) + namesize + linksize); 484 lrc = (lr_create_t *)&itx->itx_lr; 485 lr = &lrc->lr_create; 486 lr->lr_doid = dzp->z_id; 487 lr->lr_foid = zp->z_id; 488 lr->lr_uid = KUID_TO_SUID(ZTOUID(zp)); 489 lr->lr_gid = KGID_TO_SGID(ZTOGID(zp)); 490 lr->lr_mode = zp->z_mode; 491 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, 492 sizeof (uint64_t)); 493 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 494 lr->lr_crtime, sizeof (uint64_t) * 2); 495 memcpy(&lrc->lr_data[0], name, namesize); 496 memcpy(&lrc->lr_data[namesize], link, linksize); 497 498 zil_itx_assign(zilog, itx, tx); 499 } 500 501 static void 502 do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, 503 const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) 504 { 505 itx_t *itx; 506 _lr_rename_t *lr; 507 lr_rename_t *lrr; 508 size_t snamesize = strlen(sname) + 1; 509 size_t dnamesize = strlen(dname) + 1; 510 511 if (zil_replaying(zilog, tx)) 512 return; 513 514 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); 515 lrr = (lr_rename_t *)&itx->itx_lr; 516 lr = &lrr->lr_rename; 517 lr->lr_sdoid = sdzp->z_id; 518 lr->lr_tdoid = tdzp->z_id; 519 memcpy(&lrr->lr_data[0], sname, snamesize); 520 memcpy(&lrr->lr_data[snamesize], dname, dnamesize); 521 itx->itx_oid = szp->z_id; 522 523 zil_itx_assign(zilog, itx, tx); 524 } 525 526 /* 527 * Handles TX_RENAME transactions. 
528 */ 529 void 530 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, 531 const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) 532 { 533 txtype |= TX_RENAME; 534 do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); 535 } 536 537 /* 538 * Handles TX_RENAME_EXCHANGE transactions. 539 */ 540 void 541 zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 542 znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, 543 znode_t *szp) 544 { 545 txtype |= TX_RENAME_EXCHANGE; 546 do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); 547 } 548 549 /* 550 * Handles TX_RENAME_WHITEOUT transactions. 551 * 552 * Unfortunately we cannot reuse do_zfs_log_rename because we we need to call 553 * zfs_mknode() on replay which requires stashing bits as with TX_CREATE. 554 */ 555 void 556 zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 557 znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, 558 znode_t *szp, znode_t *wzp) 559 { 560 itx_t *itx; 561 lr_rename_whiteout_t *lr; 562 size_t snamesize = strlen(sname) + 1; 563 size_t dnamesize = strlen(dname) + 1; 564 565 if (zil_replaying(zilog, tx)) 566 return; 567 568 txtype |= TX_RENAME_WHITEOUT; 569 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); 570 lr = (lr_rename_whiteout_t *)&itx->itx_lr; 571 lr->lr_rename.lr_sdoid = sdzp->z_id; 572 lr->lr_rename.lr_tdoid = tdzp->z_id; 573 574 /* 575 * RENAME_WHITEOUT will create an entry at the source znode, so we need 576 * to store the same data that the equivalent call to zfs_log_create() 577 * would. 
578 */ 579 lr->lr_wfoid = wzp->z_id; 580 LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT); 581 (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen, 582 sizeof (uint64_t)); 583 (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)), 584 lr->lr_wcrtime, sizeof (uint64_t) * 2); 585 lr->lr_wmode = wzp->z_mode; 586 lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp)); 587 lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp)); 588 589 /* 590 * This rdev will always be makdevice(0, 0) but because the ZIL log and 591 * replay code needs to be platform independent (and there is no 592 * platform independent makdev()) we need to copy the one created 593 * during the rename operation. 594 */ 595 (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev, 596 sizeof (lr->lr_wrdev)); 597 598 memcpy(&lr->lr_data[0], sname, snamesize); 599 memcpy(&lr->lr_data[snamesize], dname, dnamesize); 600 itx->itx_oid = szp->z_id; 601 602 zil_itx_assign(zilog, itx, tx); 603 } 604 605 /* 606 * zfs_log_write() handles TX_WRITE transactions. The specified callback is 607 * called as soon as the write is on stable storage (be it via a DMU sync or a 608 * ZIL commit). 
 */
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
    boolean_t o_direct, zil_callback_t callback, void *callback_data)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
	uint32_t blocksize = zp->z_blksz;
	itx_wr_state_t write_state;
	uint64_t gen = 0, log_size = 0;

	/*
	 * Nothing to log on replay or when the file (or its xattr owner)
	 * is already unlinked; still honor the callback contract.
	 */
	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
	    zfs_xattr_owner_unlinked(zp)) {
		if (callback != NULL)
			callback(callback_data, 0);
		return;
	}

	/* Pick WR_COPIED / WR_NEED_COPY / WR_INDIRECT for this write. */
	write_state = zil_write_state(zilog, resid, blocksize, o_direct,
	    commit);

	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
	    sizeof (gen));

	/* Emit one itx per chunk until the whole write is covered. */
	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = resid;

		/*
		 * A WR_COPIED record must fit entirely in one log block.
		 * Large writes can use WR_NEED_COPY, which the ZIL will
		 * split into multiple records across several log blocks
		 * if necessary.
		 */
		if (wr_state == WR_COPIED &&
		    resid > zil_max_copied_data(zilog))
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		/* WR_COPIED records carry the data inline after the header. */
		itx = zil_itx_create(txtype, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;

		/*
		 * For WR_COPIED records, copy the data into the lr_write_t.
		 */
		if (wr_state == WR_COPIED) {
			int err;
			DB_DNODE_ENTER(db);
			err = dmu_read_by_dnode(DB_DNODE(db), off, len,
			    &lr->lr_data[0], DMU_READ_NO_PREFETCH |
			    DMU_KEEP_CACHING);
			DB_DNODE_EXIT(db);
			if (err != 0) {
				/*
				 * Inline copy failed; recreate the itx
				 * without the payload and fall back to
				 * WR_NEED_COPY.
				 */
				zil_itx_destroy(itx, 0);
				itx = zil_itx_create(txtype, sizeof (*lr));
				lr = (lr_write_t *)&itx->itx_lr;
				wr_state = WR_NEED_COPY;
			}
		}

		/* Account the eventual on-disk log usage of this record. */
		log_size += itx->itx_size;
		if (wr_state == WR_NEED_COPY)
			log_size += len;

		itx->itx_wr_state = wr_state;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = ZTOZSB(zp);
		itx->itx_sync = (zp->z_sync_cnt != 0);
		itx->itx_gen = gen;

		/* Attach the callback to the final record only. */
		if (resid == len) {
			itx->itx_callback = callback;
			itx->itx_callback_data = callback_data;
		}

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}

	dsl_pool_wrlog_count(zilog->zl_dmu_pool, log_size, tx->tx_txg);
}

/*
 * Handles TX_TRUNCATE transactions: logs the object id plus the
 * offset/length of the truncated range.
 */
void
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, uint64_t off, uint64_t len)
{
	itx_t *itx;
	lr_truncate_t *lr;

	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
	    zfs_xattr_owner_unlinked(zp))
		return;

	itx = zil_itx_create(txtype, sizeof (*lr));
	lr = (lr_truncate_t *)&itx->itx_lr;
	lr->lr_foid = zp->z_id;
	lr->lr_offset = off;
	lr->lr_length = len;

	itx->itx_sync = (zp->z_sync_cnt != 0);
	zil_itx_assign(zilog, itx, tx);
}

/*
 * Handles TX_SETATTR transactions.
728 */ 729 void 730 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, 731 znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) 732 { 733 itx_t *itx; 734 lr_setattr_t *lr; 735 xvattr_t *xvap = (xvattr_t *)vap; 736 size_t recsize = sizeof (lr_setattr_t); 737 uint8_t *start; 738 739 if (zil_replaying(zilog, tx) || zp->z_unlinked) 740 return; 741 742 /* 743 * If XVATTR set, then log record size needs to allow 744 * for lr_attr_t + xvattr mask, mapsize and create time 745 * plus actual attribute values 746 */ 747 if (vap->va_mask & ATTR_XVATTR) 748 recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); 749 750 if (fuidp) 751 recsize += fuidp->z_domain_str_sz; 752 753 itx = zil_itx_create(txtype, recsize); 754 lr = (lr_setattr_t *)&itx->itx_lr; 755 lr->lr_foid = zp->z_id; 756 lr->lr_mask = (uint64_t)mask_applied; 757 lr->lr_mode = (uint64_t)vap->va_mode; 758 if ((mask_applied & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) 759 lr->lr_uid = fuidp->z_fuid_owner; 760 else 761 lr->lr_uid = (uint64_t)vap->va_uid; 762 763 if ((mask_applied & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) 764 lr->lr_gid = fuidp->z_fuid_group; 765 else 766 lr->lr_gid = (uint64_t)vap->va_gid; 767 768 lr->lr_size = (uint64_t)vap->va_size; 769 ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); 770 ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); 771 start = &lr->lr_data[0]; 772 if (vap->va_mask & ATTR_XVATTR) { 773 zfs_log_xvattr((lr_attr_t *)start, xvap); 774 start = &lr->lr_data[ZIL_XVAT_SIZE(xvap->xva_mapsize)]; 775 } 776 777 /* 778 * Now stick on domain information if any on end 779 */ 780 781 if (fuidp) 782 (void) zfs_log_fuid_domains(fuidp, start); 783 784 itx->itx_sync = (zp->z_sync_cnt != 0); 785 zil_itx_assign(zilog, itx, tx); 786 } 787 788 /* 789 * Handles TX_SETSAXATTR transactions. 
 */
void
zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, const char *name, const void *value, size_t size)
{
	itx_t *itx;
	lr_setsaxattr_t *lr;
	size_t recsize = sizeof (lr_setsaxattr_t);
	int namelen;

	if (zil_replaying(zilog, tx) || zp->z_unlinked)
		return;

	namelen = strlen(name) + 1;
	recsize += (namelen + size);
	itx = zil_itx_create(txtype, recsize);
	lr = (lr_setsaxattr_t *)&itx->itx_lr;
	lr->lr_foid = zp->z_id;
	/* NUL-terminated name first, then the value bytes (if any). */
	memcpy(&lr->lr_data[0], name, namelen);
	if (value != NULL) {
		memcpy(&lr->lr_data[namelen], value, size);
		lr->lr_size = size;
	} else {
		/* No value supplied; log a zero-length payload. */
		lr->lr_size = 0;
	}

	itx->itx_sync = (zp->z_sync_cnt != 0);
	zil_itx_assign(zilog, itx, tx);
}

/*
 * Handles TX_ACL transactions.
 */
void
zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
    vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
{
	itx_t *itx;
	lr_acl_v0_t *lrv0;
	lr_acl_t *lr;
	int txtype;
	int lrsize;
	size_t txsize;
	size_t aclbytes = vsecp->vsa_aclentsz;

	if (zil_replaying(zilog, tx) || zp->z_unlinked)
		return;

	/* Pre-FUID pools use the smaller v0 record format. */
	txtype = (ZTOZSB(zp)->z_version < ZPL_VERSION_FUID) ?
	    TX_ACL_V0 : TX_ACL;

	if (txtype == TX_ACL)
		lrsize = sizeof (*lr);
	else
		lrsize = sizeof (*lrv0);

	/*
	 * TX_ACL records pad the ACEs to ZIL_ACE_LENGTH and append the
	 * FUID ids and domain strings; v0 records carry the raw ACEs only.
	 */
	txsize = lrsize +
	    ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
	    (fuidp ? fuidp->z_domain_str_sz : 0) +
	    sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);

	itx = zil_itx_create(txtype, txsize);

	lr = (lr_acl_t *)&itx->itx_lr;
	lr->lr_foid = zp->z_id;
	if (txtype == TX_ACL) {
		lr->lr_acl_bytes = aclbytes;
		lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
		lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
		if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
			lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
		else
			lr->lr_acl_flags = 0;
	}
	lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;

	if (txtype == TX_ACL_V0) {
		lrv0 = (lr_acl_v0_t *)lr;
		memcpy(&lrv0->lr_data[0], vsecp->vsa_aclentp, aclbytes);
	} else {
		uint8_t *start = &lr->lr_data[0];

		memcpy(start, vsecp->vsa_aclentp, aclbytes);

		/* FUID payload begins after the padded ACE area. */
		start = &lr->lr_data[ZIL_ACE_LENGTH(aclbytes)];

		if (fuidp) {
			start = zfs_log_fuid_ids(fuidp, start);
			(void) zfs_log_fuid_domains(fuidp, start);
		}
	}

	itx->itx_sync = (zp->z_sync_cnt != 0);
	zil_itx_assign(zilog, itx, tx);
}

/*
 * Handles TX_CLONE_RANGE transactions.
 */
void
zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
    uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
    size_t nbps)
{
	itx_t *itx;
	lr_clone_range_t *lr;
	uint64_t partlen, max_log_data;
	size_t partnbps;

	if (zil_replaying(zilog, tx) || zp->z_unlinked)
		return;

	max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));

	/* Split the clone into records that each fit in one log block. */
	while (nbps > 0) {
		partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
		partlen = partnbps * blksz;
		ASSERT3U(partlen, <, len + blksz);
		/* The final record may cover less than partnbps*blksz. */
		partlen = MIN(partlen, len);

		itx = zil_itx_create(txtype,
		    sizeof (*lr) + sizeof (bps[0]) * partnbps);
		lr = (lr_clone_range_t *)&itx->itx_lr;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = partlen;
		lr->lr_blksz = blksz;
		lr->lr_nbps = partnbps;
		memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);

		itx->itx_sync = (zp->z_sync_cnt != 0);

		zil_itx_assign(zilog, itx, tx);

		/* Advance to the portion not yet logged. */
		bps += partnbps;
		ASSERT3U(nbps, >=, partnbps);
		nbps -= partnbps;
		off += partlen;
		ASSERT3U(len, >=, partlen);
		len -= partlen;
	}
}