1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2015, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2022 by Pawel Jakub Dawidek 26 */ 27 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/cmn_err.h> 33 #include <sys/kmem.h> 34 #include <sys/thread.h> 35 #include <sys/file.h> 36 #include <sys/vfs.h> 37 #include <sys/zfs_znode.h> 38 #include <sys/zfs_dir.h> 39 #include <sys/zil.h> 40 #include <sys/zil_impl.h> 41 #include <sys/byteorder.h> 42 #include <sys/policy.h> 43 #include <sys/stat.h> 44 #include <sys/acl.h> 45 #include <sys/dmu.h> 46 #include <sys/dbuf.h> 47 #include <sys/spa.h> 48 #include <sys/zfs_fuid.h> 49 #include <sys/dsl_dataset.h> 50 51 /* 52 * These zfs_log_* functions must be called within a dmu tx, in one 53 * of 2 contexts depending on zilog->z_replay: 54 * 55 * Non replay mode 56 * --------------- 57 * We need to record the transaction so that if it is committed to 58 * the Intent Log then it can be replayed. 
An intent log transaction
 * structure (itx_t) is allocated and all the information necessary to
 * possibly replay the transaction is saved in it. The itx is then assigned
 * a sequence number and inserted in the in-memory list anchored in the zilog.
 *
 * Replay mode
 * -----------
 * We need to mark the intent log record as replayed in the log header.
 * This is done in the same transaction as the replay so that they
 * commit atomically.
 */

/*
 * Pick the create-style transaction type for a new object.
 *
 * The choice depends on the kind of object (file, directory or xattr
 * directory), on whether an ACL was supplied (vsecp != NULL) and on whether
 * the caller passed extended attributes (ATTR_XVATTR set in vap->va_mask).
 * Files and directories each have four variants; an xattr directory always
 * maps to TX_MKXATTR.  An unrecognized type trips the assertion and
 * TX_MAX_TYPE is returned.
 */
int
zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
{
	const int has_acl = (vsecp != NULL);
	const int has_xvattr = ((vap->va_mask & ATTR_XVATTR) != 0);

	switch (type) {
	case Z_FILE:
		if (has_acl)
			return (has_xvattr ?
			    TX_CREATE_ACL_ATTR : TX_CREATE_ACL);
		return (has_xvattr ? TX_CREATE_ATTR : TX_CREATE);
	case Z_DIR:
		if (has_acl)
			return (has_xvattr ?
			    TX_MKDIR_ACL_ATTR : TX_MKDIR_ACL);
		return (has_xvattr ? TX_MKDIR_ATTR : TX_MKDIR);
	case Z_XATTRDIR:
		return (TX_MKXATTR);
	}
	ASSERT(0);
	return (TX_MAX_TYPE);
}

/*
 * build up the log data necessary for logging xvattr_t
 * First lr_attr_t is initialized.  following the lr_attr_t
 * is the mapsize and attribute bitmap copied from the xvattr_t.
 * Following the bitmap and bitmapsize two 64 bit words are reserved
 * for the create time which may be set.  Following the create time
 * records a single 64 bit integer which has the bits to set on
 * replay for the xvattr.
108 */ 109 static void 110 zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) 111 { 112 xoptattr_t *xoap; 113 114 xoap = xva_getxoptattr(xvap); 115 ASSERT(xoap); 116 117 lrattr->lr_attr_masksize = xvap->xva_mapsize; 118 uint32_t *bitmap = &lrattr->lr_attr_bitmap; 119 for (int i = 0; i != xvap->xva_mapsize; i++, bitmap++) 120 *bitmap = xvap->xva_reqattrmap[i]; 121 122 lr_attr_end_t *end = (lr_attr_end_t *)bitmap; 123 end->lr_attr_attrs = 0; 124 end->lr_attr_crtime[0] = 0; 125 end->lr_attr_crtime[1] = 0; 126 memset(end->lr_attr_scanstamp, 0, AV_SCANSTAMP_SZ); 127 128 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) 129 end->lr_attr_attrs |= (xoap->xoa_readonly == 0) ? 0 : 130 XAT0_READONLY; 131 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) 132 end->lr_attr_attrs |= (xoap->xoa_hidden == 0) ? 0 : 133 XAT0_HIDDEN; 134 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) 135 end->lr_attr_attrs |= (xoap->xoa_system == 0) ? 0 : 136 XAT0_SYSTEM; 137 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) 138 end->lr_attr_attrs |= (xoap->xoa_archive == 0) ? 0 : 139 XAT0_ARCHIVE; 140 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) 141 end->lr_attr_attrs |= (xoap->xoa_immutable == 0) ? 0 : 142 XAT0_IMMUTABLE; 143 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) 144 end->lr_attr_attrs |= (xoap->xoa_nounlink == 0) ? 0 : 145 XAT0_NOUNLINK; 146 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) 147 end->lr_attr_attrs |= (xoap->xoa_appendonly == 0) ? 0 : 148 XAT0_APPENDONLY; 149 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) 150 end->lr_attr_attrs |= (xoap->xoa_opaque == 0) ? 0 : 151 XAT0_APPENDONLY; 152 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) 153 end->lr_attr_attrs |= (xoap->xoa_nodump == 0) ? 0 : 154 XAT0_NODUMP; 155 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) 156 end->lr_attr_attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : 157 XAT0_AV_QUARANTINED; 158 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) 159 end->lr_attr_attrs |= (xoap->xoa_av_modified == 0) ? 
0 : 160 XAT0_AV_MODIFIED; 161 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 162 ZFS_TIME_ENCODE(&xoap->xoa_createtime, end->lr_attr_crtime); 163 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 164 ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); 165 166 memcpy(end->lr_attr_scanstamp, xoap->xoa_av_scanstamp, 167 AV_SCANSTAMP_SZ); 168 } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 169 /* 170 * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid 171 * at the same time, so we can share the same space. 172 */ 173 memcpy(end->lr_attr_scanstamp, &xoap->xoa_projid, 174 sizeof (uint64_t)); 175 } 176 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) 177 end->lr_attr_attrs |= (xoap->xoa_reparse == 0) ? 0 : 178 XAT0_REPARSE; 179 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) 180 end->lr_attr_attrs |= (xoap->xoa_offline == 0) ? 0 : 181 XAT0_OFFLINE; 182 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) 183 end->lr_attr_attrs |= (xoap->xoa_sparse == 0) ? 0 : 184 XAT0_SPARSE; 185 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) 186 end->lr_attr_attrs |= (xoap->xoa_projinherit == 0) ? 0 : 187 XAT0_PROJINHERIT; 188 } 189 190 static void * 191 zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) 192 { 193 zfs_fuid_t *zfuid; 194 uint64_t *fuidloc = start; 195 196 /* First copy in the ACE FUIDs */ 197 for (zfuid = list_head(&fuidp->z_fuids); zfuid; 198 zfuid = list_next(&fuidp->z_fuids, zfuid)) { 199 *fuidloc++ = zfuid->z_logfuid; 200 } 201 return (fuidloc); 202 } 203 204 205 static void * 206 zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) 207 { 208 zfs_fuid_domain_t *zdomain; 209 210 /* now copy in the domain info, if any */ 211 if (fuidp->z_domain_str_sz != 0) { 212 for (zdomain = list_head(&fuidp->z_domains); zdomain; 213 zdomain = list_next(&fuidp->z_domains, zdomain)) { 214 memcpy(start, zdomain->z_domain, 215 strlen(zdomain->z_domain) + 1); 216 start = (caddr_t)start + 217 strlen(zdomain->z_domain) + 1; 218 } 219 } 220 return (start); 221 } 222 223 /* 224 * If zp is an xattr node, check whether the xattr owner is unlinked. 
225 * We don't want to log anything if the owner is unlinked. 226 */ 227 static int 228 zfs_xattr_owner_unlinked(znode_t *zp) 229 { 230 int unlinked = 0; 231 znode_t *dzp; 232 #ifdef __FreeBSD__ 233 znode_t *tzp = zp; 234 235 /* 236 * zrele drops the vnode lock which violates the VOP locking contract 237 * on FreeBSD. See comment at the top of zfs_replay.c for more detail. 238 */ 239 /* 240 * if zp is XATTR node, keep walking up via z_xattr_parent until we 241 * get the owner 242 */ 243 while (tzp->z_pflags & ZFS_XATTR) { 244 ASSERT3U(zp->z_xattr_parent, !=, 0); 245 if (zfs_zget(ZTOZSB(tzp), tzp->z_xattr_parent, &dzp) != 0) { 246 unlinked = 1; 247 break; 248 } 249 250 if (tzp != zp) 251 zrele(tzp); 252 tzp = dzp; 253 unlinked = tzp->z_unlinked; 254 } 255 if (tzp != zp) 256 zrele(tzp); 257 #else 258 zhold(zp); 259 /* 260 * if zp is XATTR node, keep walking up via z_xattr_parent until we 261 * get the owner 262 */ 263 while (zp->z_pflags & ZFS_XATTR) { 264 ASSERT3U(zp->z_xattr_parent, !=, 0); 265 if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) { 266 unlinked = 1; 267 break; 268 } 269 270 zrele(zp); 271 zp = dzp; 272 unlinked = zp->z_unlinked; 273 } 274 zrele(zp); 275 #endif 276 return (unlinked); 277 } 278 279 /* 280 * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and 281 * TK_MKXATTR transactions. 282 * 283 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID 284 * domain information appended prior to the name. In this case the 285 * uid/gid in the log record will be a log centric FUID. 286 * 287 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that 288 * may contain attributes, ACL and optional fuid information. 289 * 290 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify 291 * and ACL and normal users/groups in the ACEs. 292 * 293 * There may be an optional xvattr attribute information similar 294 * to zfs_log_setattr. 295 * 296 * Also, after the file name "domain" strings may be appended. 
297 */ 298 void 299 zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 300 znode_t *dzp, znode_t *zp, const char *name, vsecattr_t *vsecp, 301 zfs_fuid_info_t *fuidp, vattr_t *vap) 302 { 303 itx_t *itx; 304 _lr_create_t *lr; 305 lr_acl_create_t *lracl = NULL; 306 uint8_t *lrdata; 307 size_t aclsize = 0; 308 size_t xvatsize = 0; 309 size_t txsize; 310 xvattr_t *xvap = (xvattr_t *)vap; 311 size_t namesize = strlen(name) + 1; 312 size_t fuidsz = 0; 313 314 if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) 315 return; 316 317 /* 318 * If we have FUIDs present then add in space for 319 * domains and ACE fuid's if any. 320 */ 321 if (fuidp) { 322 fuidsz += fuidp->z_domain_str_sz; 323 fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); 324 } 325 326 if (vap->va_mask & ATTR_XVATTR) 327 xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); 328 329 if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || 330 (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || 331 (int)txtype == TX_MKXATTR) { 332 txsize = sizeof (lr_create_t) + namesize + fuidsz + xvatsize; 333 itx = zil_itx_create(txtype, txsize); 334 lr_create_t *lrc = (lr_create_t *)&itx->itx_lr; 335 lrdata = &lrc->lr_data[0]; 336 } else { 337 txsize = 338 sizeof (lr_acl_create_t) + namesize + fuidsz + 339 ZIL_ACE_LENGTH(aclsize) + xvatsize; 340 itx = zil_itx_create(txtype, txsize); 341 lracl = (lr_acl_create_t *)&itx->itx_lr; 342 lrdata = &lracl->lr_data[0]; 343 } 344 345 346 lr = (_lr_create_t *)&itx->itx_lr; 347 lr->lr_doid = dzp->z_id; 348 lr->lr_foid = zp->z_id; 349 /* Store dnode slot count in 8 bits above object id. 
*/ 350 LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); 351 lr->lr_mode = zp->z_mode; 352 if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp)))) { 353 lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOUID(zp)); 354 } else { 355 lr->lr_uid = fuidp->z_fuid_owner; 356 } 357 if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp)))) { 358 lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOGID(zp)); 359 } else { 360 lr->lr_gid = fuidp->z_fuid_group; 361 } 362 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, 363 sizeof (uint64_t)); 364 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 365 lr->lr_crtime, sizeof (uint64_t) * 2); 366 367 if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev, 368 sizeof (lr->lr_rdev)) != 0) 369 lr->lr_rdev = 0; 370 371 /* 372 * Fill in xvattr info if any 373 */ 374 if (vap->va_mask & ATTR_XVATTR) { 375 zfs_log_xvattr((lr_attr_t *)lrdata, xvap); 376 lrdata = &lrdata[xvatsize]; 377 } 378 379 /* Now fill in any ACL info */ 380 381 if (vsecp) { 382 ASSERT3P(lracl, !=, NULL); 383 lracl->lr_aclcnt = vsecp->vsa_aclcnt; 384 lracl->lr_acl_bytes = aclsize; 385 lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; 386 lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; 387 if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) 388 lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; 389 else 390 lracl->lr_acl_flags = 0; 391 392 memcpy(lrdata, vsecp->vsa_aclentp, aclsize); 393 lrdata = &lrdata[ZIL_ACE_LENGTH(aclsize)]; 394 } 395 396 /* drop in FUID info */ 397 if (fuidp) { 398 lrdata = zfs_log_fuid_ids(fuidp, lrdata); 399 lrdata = zfs_log_fuid_domains(fuidp, lrdata); 400 } 401 /* 402 * Now place file name in log record 403 */ 404 memcpy(lrdata, name, namesize); 405 406 zil_itx_assign(zilog, itx, tx); 407 } 408 409 /* 410 * Handles both TX_REMOVE and TX_RMDIR transactions. 
411 */ 412 void 413 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 414 znode_t *dzp, const char *name, uint64_t foid, boolean_t unlinked) 415 { 416 itx_t *itx; 417 lr_remove_t *lr; 418 size_t namesize = strlen(name) + 1; 419 420 if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) 421 return; 422 423 itx = zil_itx_create(txtype, sizeof (*lr) + namesize); 424 lr = (lr_remove_t *)&itx->itx_lr; 425 lr->lr_doid = dzp->z_id; 426 memcpy(&lr->lr_data[0], name, namesize); 427 428 itx->itx_oid = foid; 429 430 /* 431 * Object ids can be re-instantiated in the next txg so 432 * remove any async transactions to avoid future leaks. 433 * This can happen if a fsync occurs on the re-instantiated 434 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets 435 * the new file data and flushes a write record for the old object. 436 */ 437 if (unlinked) { 438 ASSERT((txtype & ~TX_CI) == TX_REMOVE); 439 zil_remove_async(zilog, foid); 440 } 441 zil_itx_assign(zilog, itx, tx); 442 } 443 444 /* 445 * Handles TX_LINK transactions. 446 */ 447 void 448 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 449 znode_t *dzp, znode_t *zp, const char *name) 450 { 451 itx_t *itx; 452 lr_link_t *lr; 453 size_t namesize = strlen(name) + 1; 454 455 if (zil_replaying(zilog, tx)) 456 return; 457 458 itx = zil_itx_create(txtype, sizeof (*lr) + namesize); 459 lr = (lr_link_t *)&itx->itx_lr; 460 lr->lr_doid = dzp->z_id; 461 lr->lr_link_obj = zp->z_id; 462 memcpy(&lr->lr_data[0], name, namesize); 463 464 zil_itx_assign(zilog, itx, tx); 465 } 466 467 /* 468 * Handles TX_SYMLINK transactions. 
469 */ 470 void 471 zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 472 znode_t *dzp, znode_t *zp, const char *name, const char *link) 473 { 474 itx_t *itx; 475 _lr_create_t *lr; 476 lr_create_t *lrc; 477 size_t namesize = strlen(name) + 1; 478 size_t linksize = strlen(link) + 1; 479 480 if (zil_replaying(zilog, tx)) 481 return; 482 483 itx = zil_itx_create(txtype, sizeof (*lrc) + namesize + linksize); 484 lrc = (lr_create_t *)&itx->itx_lr; 485 lr = &lrc->lr_create; 486 lr->lr_doid = dzp->z_id; 487 lr->lr_foid = zp->z_id; 488 lr->lr_uid = KUID_TO_SUID(ZTOUID(zp)); 489 lr->lr_gid = KGID_TO_SGID(ZTOGID(zp)); 490 lr->lr_mode = zp->z_mode; 491 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, 492 sizeof (uint64_t)); 493 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), 494 lr->lr_crtime, sizeof (uint64_t) * 2); 495 memcpy(&lrc->lr_data[0], name, namesize); 496 memcpy(&lrc->lr_data[namesize], link, linksize); 497 498 zil_itx_assign(zilog, itx, tx); 499 } 500 501 static void 502 do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, 503 const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) 504 { 505 itx_t *itx; 506 _lr_rename_t *lr; 507 lr_rename_t *lrr; 508 size_t snamesize = strlen(sname) + 1; 509 size_t dnamesize = strlen(dname) + 1; 510 511 if (zil_replaying(zilog, tx)) 512 return; 513 514 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); 515 lrr = (lr_rename_t *)&itx->itx_lr; 516 lr = &lrr->lr_rename; 517 lr->lr_sdoid = sdzp->z_id; 518 lr->lr_tdoid = tdzp->z_id; 519 memcpy(&lrr->lr_data[0], sname, snamesize); 520 memcpy(&lrr->lr_data[snamesize], dname, dnamesize); 521 itx->itx_oid = szp->z_id; 522 523 zil_itx_assign(zilog, itx, tx); 524 } 525 526 /* 527 * Handles TX_RENAME transactions. 
528 */ 529 void 530 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, 531 const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) 532 { 533 txtype |= TX_RENAME; 534 do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); 535 } 536 537 /* 538 * Handles TX_RENAME_EXCHANGE transactions. 539 */ 540 void 541 zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 542 znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, 543 znode_t *szp) 544 { 545 txtype |= TX_RENAME_EXCHANGE; 546 do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); 547 } 548 549 /* 550 * Handles TX_RENAME_WHITEOUT transactions. 551 * 552 * Unfortunately we cannot reuse do_zfs_log_rename because we we need to call 553 * zfs_mknode() on replay which requires stashing bits as with TX_CREATE. 554 */ 555 void 556 zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, 557 znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, 558 znode_t *szp, znode_t *wzp) 559 { 560 itx_t *itx; 561 lr_rename_whiteout_t *lr; 562 size_t snamesize = strlen(sname) + 1; 563 size_t dnamesize = strlen(dname) + 1; 564 565 if (zil_replaying(zilog, tx)) 566 return; 567 568 txtype |= TX_RENAME_WHITEOUT; 569 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); 570 lr = (lr_rename_whiteout_t *)&itx->itx_lr; 571 lr->lr_rename.lr_sdoid = sdzp->z_id; 572 lr->lr_rename.lr_tdoid = tdzp->z_id; 573 574 /* 575 * RENAME_WHITEOUT will create an entry at the source znode, so we need 576 * to store the same data that the equivalent call to zfs_log_create() 577 * would. 
578 */ 579 lr->lr_wfoid = wzp->z_id; 580 LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT); 581 (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen, 582 sizeof (uint64_t)); 583 (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)), 584 lr->lr_wcrtime, sizeof (uint64_t) * 2); 585 lr->lr_wmode = wzp->z_mode; 586 lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp)); 587 lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp)); 588 589 /* 590 * This rdev will always be makdevice(0, 0) but because the ZIL log and 591 * replay code needs to be platform independent (and there is no 592 * platform independent makdev()) we need to copy the one created 593 * during the rename operation. 594 */ 595 (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev, 596 sizeof (lr->lr_wrdev)); 597 598 memcpy(&lr->lr_data[0], sname, snamesize); 599 memcpy(&lr->lr_data[snamesize], dname, dnamesize); 600 itx->itx_oid = szp->z_id; 601 602 zil_itx_assign(zilog, itx, tx); 603 } 604 605 /* 606 * zfs_log_write() handles TX_WRITE transactions. The specified callback is 607 * called as soon as the write is on stable storage (be it via a DMU sync or a 608 * ZIL commit). 
609 */ 610 static uint_t zfs_immediate_write_sz = 32768; 611 612 void 613 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, 614 znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, 615 boolean_t o_direct, zil_callback_t callback, void *callback_data) 616 { 617 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 618 uint32_t blocksize = zp->z_blksz; 619 itx_wr_state_t write_state; 620 uint64_t gen = 0, log_size = 0; 621 622 if (zil_replaying(zilog, tx) || zp->z_unlinked || 623 zfs_xattr_owner_unlinked(zp)) { 624 if (callback != NULL) 625 callback(callback_data); 626 return; 627 } 628 629 if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) 630 write_state = WR_INDIRECT; 631 else if (!spa_has_slogs(zilog->zl_spa) && 632 resid >= zfs_immediate_write_sz) 633 write_state = WR_INDIRECT; 634 else if (commit) 635 write_state = WR_COPIED; 636 else 637 write_state = WR_NEED_COPY; 638 639 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, 640 sizeof (gen)); 641 642 while (resid) { 643 itx_t *itx; 644 lr_write_t *lr; 645 itx_wr_state_t wr_state = write_state; 646 ssize_t len = resid; 647 648 /* 649 * A WR_COPIED record must fit entirely in one log block. 650 * Large writes can use WR_NEED_COPY, which the ZIL will 651 * split into multiple records across several log blocks 652 * if necessary. 653 */ 654 if (wr_state == WR_COPIED && 655 resid > zil_max_copied_data(zilog)) 656 wr_state = WR_NEED_COPY; 657 else if (wr_state == WR_INDIRECT) 658 len = MIN(blocksize - P2PHASE(off, blocksize), resid); 659 660 itx = zil_itx_create(txtype, sizeof (*lr) + 661 (wr_state == WR_COPIED ? len : 0)); 662 lr = (lr_write_t *)&itx->itx_lr; 663 664 /* 665 * For WR_COPIED records, copy the data into the lr_write_t. 
666 */ 667 if (wr_state == WR_COPIED) { 668 int err; 669 DB_DNODE_ENTER(db); 670 err = dmu_read_by_dnode(DB_DNODE(db), off, len, 671 &lr->lr_data[0], DMU_READ_NO_PREFETCH | 672 DMU_KEEP_CACHING); 673 DB_DNODE_EXIT(db); 674 if (err != 0) { 675 zil_itx_destroy(itx); 676 itx = zil_itx_create(txtype, sizeof (*lr)); 677 lr = (lr_write_t *)&itx->itx_lr; 678 wr_state = WR_NEED_COPY; 679 } 680 } 681 682 log_size += itx->itx_size; 683 if (wr_state == WR_NEED_COPY) 684 log_size += len; 685 686 itx->itx_wr_state = wr_state; 687 lr->lr_foid = zp->z_id; 688 lr->lr_offset = off; 689 lr->lr_length = len; 690 lr->lr_blkoff = 0; 691 BP_ZERO(&lr->lr_blkptr); 692 693 itx->itx_private = ZTOZSB(zp); 694 itx->itx_sync = (zp->z_sync_cnt != 0); 695 itx->itx_gen = gen; 696 697 itx->itx_callback = callback; 698 itx->itx_callback_data = callback_data; 699 zil_itx_assign(zilog, itx, tx); 700 701 off += len; 702 resid -= len; 703 } 704 705 dsl_pool_wrlog_count(zilog->zl_dmu_pool, log_size, tx->tx_txg); 706 } 707 708 /* 709 * Handles TX_TRUNCATE transactions. 710 */ 711 void 712 zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, 713 znode_t *zp, uint64_t off, uint64_t len) 714 { 715 itx_t *itx; 716 lr_truncate_t *lr; 717 718 if (zil_replaying(zilog, tx) || zp->z_unlinked || 719 zfs_xattr_owner_unlinked(zp)) 720 return; 721 722 itx = zil_itx_create(txtype, sizeof (*lr)); 723 lr = (lr_truncate_t *)&itx->itx_lr; 724 lr->lr_foid = zp->z_id; 725 lr->lr_offset = off; 726 lr->lr_length = len; 727 728 itx->itx_sync = (zp->z_sync_cnt != 0); 729 zil_itx_assign(zilog, itx, tx); 730 } 731 732 /* 733 * Handles TX_SETATTR transactions. 
734 */ 735 void 736 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, 737 znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) 738 { 739 itx_t *itx; 740 lr_setattr_t *lr; 741 xvattr_t *xvap = (xvattr_t *)vap; 742 size_t recsize = sizeof (lr_setattr_t); 743 uint8_t *start; 744 745 if (zil_replaying(zilog, tx) || zp->z_unlinked) 746 return; 747 748 /* 749 * If XVATTR set, then log record size needs to allow 750 * for lr_attr_t + xvattr mask, mapsize and create time 751 * plus actual attribute values 752 */ 753 if (vap->va_mask & ATTR_XVATTR) 754 recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); 755 756 if (fuidp) 757 recsize += fuidp->z_domain_str_sz; 758 759 itx = zil_itx_create(txtype, recsize); 760 lr = (lr_setattr_t *)&itx->itx_lr; 761 lr->lr_foid = zp->z_id; 762 lr->lr_mask = (uint64_t)mask_applied; 763 lr->lr_mode = (uint64_t)vap->va_mode; 764 if ((mask_applied & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) 765 lr->lr_uid = fuidp->z_fuid_owner; 766 else 767 lr->lr_uid = (uint64_t)vap->va_uid; 768 769 if ((mask_applied & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) 770 lr->lr_gid = fuidp->z_fuid_group; 771 else 772 lr->lr_gid = (uint64_t)vap->va_gid; 773 774 lr->lr_size = (uint64_t)vap->va_size; 775 ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); 776 ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); 777 start = &lr->lr_data[0]; 778 if (vap->va_mask & ATTR_XVATTR) { 779 zfs_log_xvattr((lr_attr_t *)start, xvap); 780 start = &lr->lr_data[ZIL_XVAT_SIZE(xvap->xva_mapsize)]; 781 } 782 783 /* 784 * Now stick on domain information if any on end 785 */ 786 787 if (fuidp) 788 (void) zfs_log_fuid_domains(fuidp, start); 789 790 itx->itx_sync = (zp->z_sync_cnt != 0); 791 zil_itx_assign(zilog, itx, tx); 792 } 793 794 /* 795 * Handles TX_SETSAXATTR transactions. 
796 */ 797 void 798 zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, 799 znode_t *zp, const char *name, const void *value, size_t size) 800 { 801 itx_t *itx; 802 lr_setsaxattr_t *lr; 803 size_t recsize = sizeof (lr_setsaxattr_t); 804 int namelen; 805 806 if (zil_replaying(zilog, tx) || zp->z_unlinked) 807 return; 808 809 namelen = strlen(name) + 1; 810 recsize += (namelen + size); 811 itx = zil_itx_create(txtype, recsize); 812 lr = (lr_setsaxattr_t *)&itx->itx_lr; 813 lr->lr_foid = zp->z_id; 814 memcpy(&lr->lr_data[0], name, namelen); 815 if (value != NULL) { 816 memcpy(&lr->lr_data[namelen], value, size); 817 lr->lr_size = size; 818 } else { 819 lr->lr_size = 0; 820 } 821 822 itx->itx_sync = (zp->z_sync_cnt != 0); 823 zil_itx_assign(zilog, itx, tx); 824 } 825 826 /* 827 * Handles TX_ACL transactions. 828 */ 829 void 830 zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, 831 vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) 832 { 833 itx_t *itx; 834 lr_acl_v0_t *lrv0; 835 lr_acl_t *lr; 836 int txtype; 837 int lrsize; 838 size_t txsize; 839 size_t aclbytes = vsecp->vsa_aclentsz; 840 841 if (zil_replaying(zilog, tx) || zp->z_unlinked) 842 return; 843 844 txtype = (ZTOZSB(zp)->z_version < ZPL_VERSION_FUID) ? 845 TX_ACL_V0 : TX_ACL; 846 847 if (txtype == TX_ACL) 848 lrsize = sizeof (*lr); 849 else 850 lrsize = sizeof (*lrv0); 851 852 txsize = lrsize + 853 ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + 854 (fuidp ? fuidp->z_domain_str_sz : 0) + 855 sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); 856 857 itx = zil_itx_create(txtype, txsize); 858 859 lr = (lr_acl_t *)&itx->itx_lr; 860 lr->lr_foid = zp->z_id; 861 if (txtype == TX_ACL) { 862 lr->lr_acl_bytes = aclbytes; 863 lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; 864 lr->lr_fuidcnt = fuidp ? 
fuidp->z_fuid_cnt : 0; 865 if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) 866 lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; 867 else 868 lr->lr_acl_flags = 0; 869 } 870 lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; 871 872 if (txtype == TX_ACL_V0) { 873 lrv0 = (lr_acl_v0_t *)lr; 874 memcpy(&lrv0->lr_data[0], vsecp->vsa_aclentp, aclbytes); 875 } else { 876 uint8_t *start = &lr->lr_data[0]; 877 878 memcpy(start, vsecp->vsa_aclentp, aclbytes); 879 880 start = &lr->lr_data[ZIL_ACE_LENGTH(aclbytes)]; 881 882 if (fuidp) { 883 start = zfs_log_fuid_ids(fuidp, start); 884 (void) zfs_log_fuid_domains(fuidp, start); 885 } 886 } 887 888 itx->itx_sync = (zp->z_sync_cnt != 0); 889 zil_itx_assign(zilog, itx, tx); 890 } 891 892 /* 893 * Handles TX_CLONE_RANGE transactions. 894 */ 895 void 896 zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, 897 uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, 898 size_t nbps) 899 { 900 itx_t *itx; 901 lr_clone_range_t *lr; 902 uint64_t partlen, max_log_data; 903 size_t partnbps; 904 905 if (zil_replaying(zilog, tx) || zp->z_unlinked) 906 return; 907 908 max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); 909 910 while (nbps > 0) { 911 partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); 912 partlen = partnbps * blksz; 913 ASSERT3U(partlen, <, len + blksz); 914 partlen = MIN(partlen, len); 915 916 itx = zil_itx_create(txtype, 917 sizeof (*lr) + sizeof (bps[0]) * partnbps); 918 lr = (lr_clone_range_t *)&itx->itx_lr; 919 lr->lr_foid = zp->z_id; 920 lr->lr_offset = off; 921 lr->lr_length = partlen; 922 lr->lr_blksz = blksz; 923 lr->lr_nbps = partnbps; 924 memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); 925 926 itx->itx_sync = (zp->z_sync_cnt != 0); 927 928 zil_itx_assign(zilog, itx, tx); 929 930 bps += partnbps; 931 ASSERT3U(nbps, >=, partnbps); 932 nbps -= partnbps; 933 off += partlen; 934 ASSERT3U(len, >=, partlen); 935 len -= partlen; 936 } 937 } 938 939 ZFS_MODULE_PARAM(zfs, zfs_, 
immediate_write_sz, UINT, ZMOD_RW, 940 "Largest data block to write to zil"); 941