/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

/*
 * This file contains the implementation of the xfs_inode_log_item.
 * It contains the item operations used to manipulate the inode log
 * items as well as utility routines used by the inode specific
 * transaction routines.
 */
#include "xfs.h"
#include "xfs_macros.h"
#include "xfs_types.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_sb.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_trans_priv.h"
#include "xfs_ag.h"
#include "xfs_alloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_attr_sf.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode_item.h"
#include "xfs_inode.h"
#include "xfs_rw.h"


kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */

/*
 * This returns the number of iovecs needed to log the given inode item.
 *
 * We need one iovec for the inode log format structure, one for the
 * inode core, and possibly one for the inode data/extents/b-tree root
 * and one for the inode attribute data/extents/b-tree root.
 */
STATIC uint
xfs_inode_item_size(
	xfs_inode_log_item_t	*iip)
{
	uint			nvecs;
	xfs_inode_t		*ip;

	ip = iip->ili_inode;
	nvecs = 2;

	/*
	 * Only log the data/extents/b-tree root if there is something
	 * left to log.
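	 * The inode log format structure and the inode core are always
	 * logged, which is why nvecs starts at 2 and XFS_ILOG_CORE is
	 * set unconditionally just below.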
	 */
	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;

	switch (ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
			  XFS_ILOG_DEV | XFS_ILOG_UUID);
		if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
		    (ip->i_d.di_nextents > 0) &&
		    (ip->i_df.if_bytes > 0)) {
			ASSERT(ip->i_df.if_u1.if_extents != NULL);
			nvecs++;
		} else {
			iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
		}
		break;

	case XFS_DINODE_FMT_BTREE:
		ASSERT(ip->i_df.if_ext_max ==
		       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
			  XFS_ILOG_DEV | XFS_ILOG_UUID);
		if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
		    (ip->i_df.if_broot_bytes > 0)) {
			ASSERT(ip->i_df.if_broot != NULL);
			nvecs++;
		} else {
			ASSERT(!(iip->ili_format.ilf_fields &
				 XFS_ILOG_DBROOT));
#ifdef XFS_TRANS_DEBUG
			if (iip->ili_root_size > 0) {
				ASSERT(iip->ili_root_size ==
				       ip->i_df.if_broot_bytes);
				ASSERT(memcmp(iip->ili_orig_root,
					      ip->i_df.if_broot,
					      iip->ili_root_size) == 0);
			} else {
				ASSERT(ip->i_df.if_broot_bytes == 0);
			}
#endif
			iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
		}
		break;

	case XFS_DINODE_FMT_LOCAL:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
			  XFS_ILOG_DEV | XFS_ILOG_UUID);
		if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
		    (ip->i_df.if_bytes > 0)) {
			ASSERT(ip->i_df.if_u1.if_data != NULL);
			ASSERT(ip->i_d.di_size > 0);
			nvecs++;
		} else {
			iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
		}
		break;

	case XFS_DINODE_FMT_DEV:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
			  XFS_ILOG_DEXT | XFS_ILOG_UUID);
		break;

	case XFS_DINODE_FMT_UUID:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
			  XFS_ILOG_DEXT | XFS_ILOG_DEV);
		break;

	default:
		ASSERT(0);
		break;
	}

	/*
	 * If there are no attributes associated with this file,
	 * then there cannot be anything more to log.
	 * Clear all attribute-related log flags.
	 */
	if (!XFS_IFORK_Q(ip)) {
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
		return nvecs;
	}

	/*
	 * Log any necessary attribute data.
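	 * The attribute fork is handled the same way as the data fork
	 * above: at most one more iovec is added for whatever form the
	 * attribute data takes.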
	 */
	switch (ip->i_d.di_aformat) {
	case XFS_DINODE_FMT_EXTENTS:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
		if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
		    (ip->i_d.di_anextents > 0) &&
		    (ip->i_afp->if_bytes > 0)) {
			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
			nvecs++;
		} else {
			iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
		}
		break;

	case XFS_DINODE_FMT_BTREE:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
		if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
		    (ip->i_afp->if_broot_bytes > 0)) {
			ASSERT(ip->i_afp->if_broot != NULL);
			nvecs++;
		} else {
			iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
		}
		break;

	case XFS_DINODE_FMT_LOCAL:
		iip->ili_format.ilf_fields &=
			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
		if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
		    (ip->i_afp->if_bytes > 0)) {
			ASSERT(ip->i_afp->if_u1.if_data != NULL);
			nvecs++;
		} else {
			iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
		}
		break;

	default:
		ASSERT(0);
		break;
	}

	return nvecs;
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given inode log item.  It fills the first item with an inode
 * log format structure, the second with the on-disk inode structure,
 * and a possible third and/or fourth with the inode data/extents/b-tree
 * root and inode attributes data/extents/b-tree root.
 */
STATIC void
xfs_inode_item_format(
	xfs_inode_log_item_t	*iip,
	xfs_log_iovec_t		*log_vector)
{
	uint			nvecs;
	xfs_log_iovec_t		*vecp;
	xfs_inode_t		*ip;
	size_t			data_bytes;
	xfs_bmbt_rec_t		*ext_buffer;
	int			nrecs;
	xfs_mount_t		*mp;

	ip = iip->ili_inode;
	vecp = log_vector;

	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
	vecp->i_len  = sizeof(xfs_inode_log_format_t);
	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT);
	vecp++;
	nvecs = 1;

	/*
	 * Clear i_update_core if the timestamps (or any other
	 * non-transactional modification) need flushing/logging
	 * and we're about to log them with the rest of the core.
	 *
	 * This is the same logic as xfs_iflush() but this code can't
	 * run at the same time as xfs_iflush because we're in commit
	 * processing here and so we have the inode lock held in
	 * exclusive mode.  It doesn't really matter whether both
	 * routines end up grabbing the timestamps; either way is fine.
	 *
	 * We clear i_update_core before copying out the data.
	 * This is for coordination with our timestamp updates
	 * that don't hold the inode lock.  They will always
	 * update the timestamps BEFORE setting i_update_core,
	 * so if we clear i_update_core after they set it we
	 * are guaranteed to see their updates to the timestamps
	 * here.  Likewise, if they set it after we clear it
	 * here, we'll see it either on the next commit of this
	 * inode or the next time the inode gets flushed via
	 * xfs_iflush().  This depends on strongly ordered memory
	 * semantics, but we have that.  We use the SYNCHRONIZE
	 * macro to make sure that the compiler does not reorder
	 * the i_update_core update below the data copy that follows.
	 */
	if (ip->i_update_core) {
		ip->i_update_core = 0;
		SYNCHRONIZE();
	}

	/*
	 * We don't have to worry about re-ordering here because
	 * the update_size field is protected by the inode lock
	 * and we have that held in exclusive mode.
	 */
	if (ip->i_update_size)
		ip->i_update_size = 0;

	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
	vecp->i_len  = sizeof(xfs_dinode_core_t);
	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
	vecp++;
	nvecs++;
	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;

	/*
	 * If this is really an old format inode, then we need to
	 * log it as such.  This means that we have to copy the link
	 * count from the new field to the old.  We don't have to worry
	 * about the new fields, because nothing trusts them as long as
	 * the old inode version number is there.  If the superblock already
	 * has a new version number, then we don't bother converting back.
	 */
	mp = ip->i_mount;
	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
	       XFS_SB_VERSION_HASNLINK(&mp->m_sb));
	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
			/*
			 * Convert it back.
			 */
			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
			ip->i_d.di_onlink = ip->i_d.di_nlink;
		} else {
			/*
			 * The superblock version has already been bumped,
			 * so just make the conversion to the new inode
			 * format permanent.
			 */
			ip->i_d.di_version = XFS_DINODE_VERSION_2;
			ip->i_d.di_onlink = 0;
			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
		}
	}

	switch (ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
			ASSERT(ip->i_df.if_bytes > 0);
			ASSERT(ip->i_df.if_u1.if_extents != NULL);
			ASSERT(ip->i_d.di_nextents > 0);
			ASSERT(iip->ili_extents_buf == NULL);
			nrecs = ip->i_df.if_bytes /
				(uint)sizeof(xfs_bmbt_rec_t);
			ASSERT(nrecs > 0);
#ifdef XFS_NATIVE_HOST
			if (nrecs == ip->i_d.di_nextents) {
				/*
				 * There are no delayed allocation
				 * extents, so just point to the
				 * real extents array.
				 */
				vecp->i_addr =
					(char *)(ip->i_df.if_u1.if_extents);
				vecp->i_len = ip->i_df.if_bytes;
				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
			} else
#endif
			{
				/*
				 * There are delayed allocation extents
				 * in the inode, or we need to convert
				 * the extents to on disk format.
				 * Use xfs_iextents_copy()
				 * to copy only the real extents into
				 * a separate buffer.  We'll free the
				 * buffer in the unlock routine.
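				 * (The buffer is freed by
				 * xfs_inode_item_unlock().)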
				 */
				ext_buffer = kmem_alloc(ip->i_df.if_bytes,
						KM_SLEEP);
				iip->ili_extents_buf = ext_buffer;
				vecp->i_addr = (xfs_caddr_t)ext_buffer;
				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
						XFS_DATA_FORK);
				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
			}
			ASSERT(vecp->i_len <= ip->i_df.if_bytes);
			iip->ili_format.ilf_dsize = vecp->i_len;
			vecp++;
			nvecs++;
		}
		break;

	case XFS_DINODE_FMT_BTREE:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
			ASSERT(ip->i_df.if_broot_bytes > 0);
			ASSERT(ip->i_df.if_broot != NULL);
			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
			vecp->i_len = ip->i_df.if_broot_bytes;
			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT);
			vecp++;
			nvecs++;
			iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
		}
		break;

	case XFS_DINODE_FMT_LOCAL:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
			ASSERT(ip->i_df.if_bytes > 0);
			ASSERT(ip->i_df.if_u1.if_data != NULL);
			ASSERT(ip->i_d.di_size > 0);

			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
			/*
			 * Round if_bytes up to a word boundary.
			 * The underlying memory is guaranteed to
			 * be there by xfs_idata_realloc().
			 */
			data_bytes = roundup(ip->i_df.if_bytes, 4);
			ASSERT((ip->i_df.if_real_bytes == 0) ||
			       (ip->i_df.if_real_bytes == data_bytes));
			vecp->i_len = (int)data_bytes;
			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL);
			vecp++;
			nvecs++;
			iip->ili_format.ilf_dsize = (unsigned)data_bytes;
		}
		break;

	case XFS_DINODE_FMT_DEV:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
			  XFS_ILOG_DDATA | XFS_ILOG_UUID)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
			iip->ili_format.ilf_u.ilfu_rdev =
				ip->i_df.if_u2.if_rdev;
		}
		break;

	case XFS_DINODE_FMT_UUID:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
			  XFS_ILOG_DDATA | XFS_ILOG_DEV)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
			iip->ili_format.ilf_u.ilfu_uuid =
				ip->i_df.if_u2.if_uuid;
		}
		break;

	default:
		ASSERT(0);
		break;
	}

	/*
	 * If there are no attributes associated with the file,
	 * then we're done.
	 * Assert that no attribute-related log flags are set.
	 */
	if (!XFS_IFORK_Q(ip)) {
		ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
		iip->ili_format.ilf_size = nvecs;
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
		return;
	}

	switch (ip->i_d.di_aformat) {
	case XFS_DINODE_FMT_EXTENTS:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
			ASSERT(ip->i_afp->if_bytes > 0);
			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
			ASSERT(ip->i_d.di_anextents > 0);
#ifdef DEBUG
			nrecs = ip->i_afp->if_bytes /
				(uint)sizeof(xfs_bmbt_rec_t);
#endif
			ASSERT(nrecs > 0);
			ASSERT(nrecs == ip->i_d.di_anextents);
#ifdef XFS_NATIVE_HOST
			/*
			 * There are no delayed allocation extents
			 * for attributes, so just point at the array.
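			 * On a native-endian host the in-core extent
			 * records are already in on-disk format, so they
			 * can be logged in place; the #else branch below
			 * copies and endian-converts them instead.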
			 */
			vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
			vecp->i_len = ip->i_afp->if_bytes;
#else
			ASSERT(iip->ili_aextents_buf == NULL);
			/*
			 * Need to endian flip before logging.
			 */
			ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
					KM_SLEEP);
			iip->ili_aextents_buf = ext_buffer;
			vecp->i_addr = (xfs_caddr_t)ext_buffer;
			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
					XFS_ATTR_FORK);
#endif
			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT);
			iip->ili_format.ilf_asize = vecp->i_len;
			vecp++;
			nvecs++;
		}
		break;

	case XFS_DINODE_FMT_BTREE:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
			ASSERT(ip->i_afp->if_broot_bytes > 0);
			ASSERT(ip->i_afp->if_broot != NULL);
			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
			vecp->i_len = ip->i_afp->if_broot_bytes;
			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT);
			vecp++;
			nvecs++;
			iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
		}
		break;

	case XFS_DINODE_FMT_LOCAL:
		ASSERT(!(iip->ili_format.ilf_fields &
			 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
		if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
			ASSERT(ip->i_afp->if_bytes > 0);
			ASSERT(ip->i_afp->if_u1.if_data != NULL);

			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
			/*
			 * Round if_bytes up to a word boundary.
			 * The underlying memory is guaranteed to
			 * be there by xfs_idata_realloc().
			 */
			data_bytes = roundup(ip->i_afp->if_bytes, 4);
			ASSERT((ip->i_afp->if_real_bytes == 0) ||
			       (ip->i_afp->if_real_bytes == data_bytes));
			vecp->i_len = (int)data_bytes;
			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL);
			vecp++;
			nvecs++;
			iip->ili_format.ilf_asize = (unsigned)data_bytes;
		}
		break;

	default:
		ASSERT(0);
		break;
	}

	ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
	iip->ili_format.ilf_size = nvecs;
}


/*
 * This is called to pin the inode associated with the inode log
 * item in memory so it cannot be written out.  Do this by calling
 * xfs_ipin() to bump the pin count in the inode while holding the
 * inode pin lock.
 */
STATIC void
xfs_inode_item_pin(
	xfs_inode_log_item_t	*iip)
{
	ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
	xfs_ipin(iip->ili_inode);
}


/*
 * This is called to unpin the inode associated with the inode log
 * item which was previously pinned with a call to xfs_inode_item_pin().
 * Just call xfs_iunpin() on the inode to do this.
 */
/* ARGSUSED */
STATIC void
xfs_inode_item_unpin(
	xfs_inode_log_item_t	*iip,
	int			stale)
{
	xfs_iunpin(iip->ili_inode);
}

/* ARGSUSED */
STATIC void
xfs_inode_item_unpin_remove(
	xfs_inode_log_item_t	*iip,
	xfs_trans_t		*tp)
{
	xfs_iunpin(iip->ili_inode);
}

/*
 * This is called to attempt to lock the inode associated with this
 * inode log item, in preparation for the push routine which does the actual
 * iflush.  Don't sleep on the inode lock or the flush lock.
 *
 * If the flush lock is already held, indicating that the inode has
 * been or is in the process of being flushed, then (ideally) we'd like to
 * see if the inode's buffer is still incore, and if so give it a nudge.
 * We delay doing so until the pushbuf routine, though, to avoid holding
 * the AIL lock across a call to the blackhole which is the buffercache.
 * We also don't want to sleep in any device strategy routines, which can
 * happen if we do the subsequent bawrite here.
 */
STATIC uint
xfs_inode_item_trylock(
	xfs_inode_log_item_t	*iip)
{
	register xfs_inode_t	*ip;

	ip = iip->ili_inode;

	if (xfs_ipincount(ip) > 0) {
		return XFS_ITEM_PINNED;
	}

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
		return XFS_ITEM_LOCKED;
	}

	if (!xfs_iflock_nowait(ip)) {
		/*
		 * If someone else isn't already trying to push the inode
		 * buffer, we get to do it.
		 */
		if (iip->ili_pushbuf_flag == 0) {
			iip->ili_pushbuf_flag = 1;
#ifdef DEBUG
			iip->ili_push_owner = get_thread_id();
#endif
			/*
			 * Inode is left locked in shared mode.
			 * Pushbuf routine gets to unlock it.
			 */
			return XFS_ITEM_PUSHBUF;
		} else {
			/*
			 * We hold the AIL_LOCK, so we must specify the
			 * NONOTIFY flag so that we won't double trip.
			 */
			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
			return XFS_ITEM_FLUSHING;
		}
		/* NOTREACHED */
	}

	/* Stale items should force out the iclog */
	if (ip->i_flags & XFS_ISTALE) {
		xfs_ifunlock(ip);
		xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
		return XFS_ITEM_PINNED;
	}

#ifdef DEBUG
	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		ASSERT(iip->ili_format.ilf_fields != 0);
		ASSERT(iip->ili_logged == 0);
		ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
	}
#endif
	return XFS_ITEM_SUCCESS;
}

/*
 * Unlock the inode associated with the inode log item.
 * Clear the fields of the inode and inode log item that
 * are specific to the current transaction.  If the
 * hold flag is set, do not unlock the inode.
 */
STATIC void
xfs_inode_item_unlock(
	xfs_inode_log_item_t	*iip)
{
	uint		hold;
	uint		iolocked;
	uint		lock_flags;
	xfs_inode_t	*ip;

	ASSERT(iip != NULL);
	ASSERT(iip->ili_inode->i_itemp != NULL);
	ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
		  XFS_ILI_IOLOCKED_EXCL)) ||
	       ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE));
	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
		  XFS_ILI_IOLOCKED_SHARED)) ||
	       ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS));
	/*
	 * Clear the transaction pointer in the inode.
	 */
	ip = iip->ili_inode;
	ip->i_transp = NULL;

	/*
	 * If the inode needed a separate buffer with which to log
	 * its extents, then free it now.
	 */
	if (iip->ili_extents_buf != NULL) {
		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
		ASSERT(ip->i_d.di_nextents > 0);
		ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
		ASSERT(ip->i_df.if_bytes > 0);
		kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
		iip->ili_extents_buf = NULL;
	}
	if (iip->ili_aextents_buf != NULL) {
		ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
		ASSERT(ip->i_d.di_anextents > 0);
		ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
		ASSERT(ip->i_afp->if_bytes > 0);
		kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes);
		iip->ili_aextents_buf = NULL;
	}

	/*
	 * Figure out if we should unlock the inode or not.
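	 * XFS_ILI_HOLD is set when the caller wants the inode to stay
	 * locked after the transaction completes.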
	 */
	hold = iip->ili_flags & XFS_ILI_HOLD;

	/*
	 * Before clearing out the flags, remember whether we
	 * are holding the inode's IO lock.
	 */
	iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;

	/*
	 * Clear out the fields of the inode log item particular
	 * to the current transaction.
	 */
	iip->ili_ilock_recur = 0;
	iip->ili_iolock_recur = 0;
	iip->ili_flags = 0;

	/*
	 * Unlock the inode if XFS_ILI_HOLD was not set.
	 */
	if (!hold) {
		lock_flags = XFS_ILOCK_EXCL;
		if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
			lock_flags |= XFS_IOLOCK_EXCL;
		} else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
			lock_flags |= XFS_IOLOCK_SHARED;
		}
		xfs_iput(iip->ili_inode, lock_flags);
	}
}

/*
 * This is called to find out where the oldest active copy of the
 * inode log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.  Since we always re-log
 * all dirty data in an inode, the latest copy in the on disk log
 * is the only one that matters.  Therefore, simply return the
 * given lsn.
 */
/*ARGSUSED*/
STATIC xfs_lsn_t
xfs_inode_item_committed(
	xfs_inode_log_item_t	*iip,
	xfs_lsn_t		lsn)
{
	return (lsn);
}

/*
 * The transaction with the inode locked has aborted.  The inode
 * must not be dirty within the transaction (unless we're forcibly
 * shutting down).  We simply unlock just as if the transaction
 * had been cancelled.
 */
STATIC void
xfs_inode_item_abort(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_item_unlock(iip);
	return;
}


/*
 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
 * failed to get the inode flush lock but did get the inode locked SHARED.
 * Here we're trying to see if the inode buffer is incore, and if so whether
 * it's marked delayed write.  If that's the case, we'll initiate a bawrite
 * on that buffer to expedite the process.
 *
 * We aren't holding the AIL_LOCK (or the flush lock) when this gets called,
 * so it is inherently racy.
 */
STATIC void
xfs_inode_item_pushbuf(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	xfs_buf_t	*bp;
	uint		dopush;

	ip = iip->ili_inode;

	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));

	/*
	 * The ili_pushbuf_flag keeps others from
	 * trying to duplicate our effort.
	 */
	ASSERT(iip->ili_pushbuf_flag != 0);
	ASSERT(iip->ili_push_owner == get_thread_id());

	/*
	 * If flushlock isn't locked anymore, chances are that the
	 * inode flush completed and the inode was taken off the AIL.
	 * So, just get out.
	 */
	if ((valusema(&(ip->i_flock)) > 0) ||
	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
		iip->ili_pushbuf_flag = 0;
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return;
	}

	mp = ip->i_mount;
	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
			iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);

	if (bp != NULL) {
		if (XFS_BUF_ISDELAYWRITE(bp)) {
			/*
			 * We were racing with iflush because we don't hold
			 * the AIL_LOCK or the flush lock.  However, at this
			 * point, we have the buffer, and we know that it's
			 * dirty.  So, it's possible that iflush raced with
			 * us, and this item has already been taken off the
			 * AIL.  If not, we can flush it async.
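			 * Only push the buffer if the item is still in
			 * the AIL and the flush lock is still held.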
			 */
			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
				  (valusema(&(ip->i_flock)) <= 0));
			iip->ili_pushbuf_flag = 0;
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			xfs_buftrace("INODE ITEM PUSH", bp);
			if (XFS_BUF_ISPINNED(bp)) {
				xfs_log_force(mp, (xfs_lsn_t)0,
					      XFS_LOG_FORCE);
			}
			if (dopush) {
				xfs_bawrite(mp, bp);
			} else {
				xfs_buf_relse(bp);
			}
		} else {
			iip->ili_pushbuf_flag = 0;
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			xfs_buf_relse(bp);
		}
		return;
	}
	/*
	 * We have to be careful about resetting the pushbuf flag too early
	 * (above).  Even though in theory we can do it as soon as we have
	 * the buflock, we don't want others to be doing work needlessly.
	 * They'll come to this function thinking that pushing the buffer is
	 * their responsibility only to find that the buffer is still locked
	 * by another thread doing the same thing.
	 */
	iip->ili_pushbuf_flag = 0;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return;
}


/*
 * This is called to asynchronously write the inode associated with this
 * inode log item out to disk.  The inode will already have been locked by
 * a successful call to xfs_inode_item_trylock().
 */
STATIC void
xfs_inode_item_push(
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip;

	ip = iip->ili_inode;

	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
	ASSERT(valusema(&(ip->i_flock)) <= 0);
	/*
	 * Since we were able to lock the inode's flush lock and
	 * we found it on the AIL, the inode must be dirty.  This
	 * is because the inode is removed from the AIL while still
	 * holding the flush lock in xfs_iflush_done().  Thus, if
	 * we found it in the AIL and were able to obtain the flush
	 * lock without sleeping, then there must not have been
	 * anyone in the process of flushing the inode.
	 */
	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
	       iip->ili_format.ilf_fields != 0);

	/*
	 * Write out the inode.  The completion routine ('iflush_done') will
	 * pull it from the AIL, mark it clean, and unlock the flush lock.
	 */
	(void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	return;
}

/*
 * XXX rcc - this one really has to do something.  Probably needs
 * to stamp in a new field in the incore inode.
 */
/* ARGSUSED */
STATIC void
xfs_inode_item_committing(
	xfs_inode_log_item_t	*iip,
	xfs_lsn_t		lsn)
{
	iip->ili_last_lsn = lsn;
	return;
}

/*
 * This is the ops vector shared by all inode log items.
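 * Each routine is cast to the generic xfs_log_item_t prototype that the
 * transaction and log code expect.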
 */
STATIC struct xfs_item_ops xfs_inode_item_ops = {
	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
					xfs_inode_item_format,
	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin,
	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
					xfs_inode_item_unpin_remove,
	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
					xfs_inode_item_committed,
	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
					xfs_inode_item_committing
};


/*
 * Initialize the inode log item for a newly allocated (in-core) inode.
 */
void
xfs_inode_item_init(
	xfs_inode_t	*ip,
	xfs_mount_t	*mp)
{
	xfs_inode_log_item_t	*iip;

	ASSERT(ip->i_itemp == NULL);
	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);

	iip->ili_item.li_type = XFS_LI_INODE;
	iip->ili_item.li_ops = &xfs_inode_item_ops;
	iip->ili_item.li_mountp = mp;
	iip->ili_inode = ip;

	/*
	   We have zeroed memory. No need ...
	   iip->ili_extents_buf = NULL;
	   iip->ili_pushbuf_flag = 0;
	 */

	iip->ili_format.ilf_type = XFS_LI_INODE;
	iip->ili_format.ilf_ino = ip->i_ino;
	iip->ili_format.ilf_blkno = ip->i_blkno;
	iip->ili_format.ilf_len = ip->i_len;
	iip->ili_format.ilf_boffset = ip->i_boffset;
}

/*
 * Free the inode log item and any memory hanging off of it.
 */
void
xfs_inode_item_destroy(
	xfs_inode_t	*ip)
{
#ifdef XFS_TRANS_DEBUG
	if (ip->i_itemp->ili_root_size != 0) {
		kmem_free(ip->i_itemp->ili_orig_root,
			  ip->i_itemp->ili_root_size);
	}
#endif
	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
}


/*
 * This is the inode flushing I/O completion routine.  It is called
 * from interrupt level when the buffer containing the inode is
 * flushed to disk.  It is responsible for removing the inode item
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
 */
/*ARGSUSED*/
void
xfs_iflush_done(
	xfs_buf_t		*bp,
	xfs_inode_log_item_t	*iip)
{
	xfs_inode_t	*ip;
	SPLDECL(s);

	ip = iip->ili_inode;

	/*
	 * We only want to pull the item from the AIL if it is
	 * actually there and its location in the log has not
	 * changed since we started the flush.  Thus, we only bother
	 * if the ili_logged flag is set and the inode's lsn has not
	 * changed.  First we check the lsn outside
	 * the lock since it's cheaper, and then we recheck while
	 * holding the lock before removing the inode from the AIL.
	 */
	if (iip->ili_logged &&
	    (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
		AIL_LOCK(ip->i_mount, s);
		if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
			/*
			 * xfs_trans_delete_ail() drops the AIL lock.
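			 * There is therefore no matching AIL_UNLOCK()
			 * on this path.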
			 */
			xfs_trans_delete_ail(ip->i_mount,
					     (xfs_log_item_t*)iip, s);
		} else {
			AIL_UNLOCK(ip->i_mount, s);
		}
	}

	iip->ili_logged = 0;

	/*
	 * Clear the ili_last_fields bits now that we know that the
	 * data corresponding to them is safely on disk.
	 */
	iip->ili_last_fields = 0;

	/*
	 * Release the inode's flush lock since we're done with it.
	 */
	xfs_ifunlock(ip);

	return;
}

/*
 * This is the inode flushing abort routine.  It is called
 * from xfs_iflush when the filesystem is shutting down to clean
 * up the inode state.  It is responsible for removing the inode item
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
 */
void
xfs_iflush_abort(
	xfs_inode_t		*ip)
{
	xfs_inode_log_item_t	*iip;
	xfs_mount_t		*mp;
	SPLDECL(s);

	iip = ip->i_itemp;
	mp = ip->i_mount;
	if (iip) {
		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
			AIL_LOCK(mp, s);
			if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
				/*
				 * xfs_trans_delete_ail() drops the AIL lock.
				 */
				xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip,
					s);
			} else
				AIL_UNLOCK(mp, s);
		}
		iip->ili_logged = 0;
		/*
		 * Clear the ili_last_fields bits now that we know that the
		 * data corresponding to them is safely on disk.
		 */
		iip->ili_last_fields = 0;
		/*
		 * Clear the inode logging fields so no more flushes are
		 * attempted.
		 */
		iip->ili_format.ilf_fields = 0;
	}
	/*
	 * Release the inode's flush lock since we're done with it.
	 */
	xfs_ifunlock(ip);
}

void
xfs_istale_done(
	xfs_buf_t		*bp,
	xfs_inode_log_item_t	*iip)
{
	xfs_iflush_abort(iip->ili_inode);
}