/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

/*
 * This file contains the implementation of the xfs_inode_log_item.
 * It contains the item operations used to manipulate the inode log
 * items as well as utility routines used by the inode specific
 * transaction routines.
38 */ 39 #include "xfs.h" 40 #include "xfs_macros.h" 41 #include "xfs_types.h" 42 #include "xfs_inum.h" 43 #include "xfs_log.h" 44 #include "xfs_trans.h" 45 #include "xfs_buf_item.h" 46 #include "xfs_sb.h" 47 #include "xfs_dir.h" 48 #include "xfs_dir2.h" 49 #include "xfs_dmapi.h" 50 #include "xfs_mount.h" 51 #include "xfs_trans_priv.h" 52 #include "xfs_ag.h" 53 #include "xfs_alloc_btree.h" 54 #include "xfs_bmap_btree.h" 55 #include "xfs_ialloc_btree.h" 56 #include "xfs_btree.h" 57 #include "xfs_ialloc.h" 58 #include "xfs_attr_sf.h" 59 #include "xfs_dir_sf.h" 60 #include "xfs_dir2_sf.h" 61 #include "xfs_dinode.h" 62 #include "xfs_inode_item.h" 63 #include "xfs_inode.h" 64 #include "xfs_rw.h" 65 66 67 kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 68 69 /* 70 * This returns the number of iovecs needed to log the given inode item. 71 * 72 * We need one iovec for the inode log format structure, one for the 73 * inode core, and possibly one for the inode data/extents/b-tree root 74 * and one for the inode attribute data/extents/b-tree root. 75 */ 76 STATIC uint 77 xfs_inode_item_size( 78 xfs_inode_log_item_t *iip) 79 { 80 uint nvecs; 81 xfs_inode_t *ip; 82 83 ip = iip->ili_inode; 84 nvecs = 2; 85 86 /* 87 * Only log the data/extents/b-tree root if there is something 88 * left to log. 
89 */ 90 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 91 92 switch (ip->i_d.di_format) { 93 case XFS_DINODE_FMT_EXTENTS: 94 iip->ili_format.ilf_fields &= 95 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 96 XFS_ILOG_DEV | XFS_ILOG_UUID); 97 if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && 98 (ip->i_d.di_nextents > 0) && 99 (ip->i_df.if_bytes > 0)) { 100 ASSERT(ip->i_df.if_u1.if_extents != NULL); 101 nvecs++; 102 } else { 103 iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; 104 } 105 break; 106 107 case XFS_DINODE_FMT_BTREE: 108 ASSERT(ip->i_df.if_ext_max == 109 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 110 iip->ili_format.ilf_fields &= 111 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 112 XFS_ILOG_DEV | XFS_ILOG_UUID); 113 if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && 114 (ip->i_df.if_broot_bytes > 0)) { 115 ASSERT(ip->i_df.if_broot != NULL); 116 nvecs++; 117 } else { 118 ASSERT(!(iip->ili_format.ilf_fields & 119 XFS_ILOG_DBROOT)); 120 #ifdef XFS_TRANS_DEBUG 121 if (iip->ili_root_size > 0) { 122 ASSERT(iip->ili_root_size == 123 ip->i_df.if_broot_bytes); 124 ASSERT(memcmp(iip->ili_orig_root, 125 ip->i_df.if_broot, 126 iip->ili_root_size) == 0); 127 } else { 128 ASSERT(ip->i_df.if_broot_bytes == 0); 129 } 130 #endif 131 iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; 132 } 133 break; 134 135 case XFS_DINODE_FMT_LOCAL: 136 iip->ili_format.ilf_fields &= 137 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 138 XFS_ILOG_DEV | XFS_ILOG_UUID); 139 if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && 140 (ip->i_df.if_bytes > 0)) { 141 ASSERT(ip->i_df.if_u1.if_data != NULL); 142 ASSERT(ip->i_d.di_size > 0); 143 nvecs++; 144 } else { 145 iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; 146 } 147 break; 148 149 case XFS_DINODE_FMT_DEV: 150 iip->ili_format.ilf_fields &= 151 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 152 XFS_ILOG_DEXT | XFS_ILOG_UUID); 153 break; 154 155 case XFS_DINODE_FMT_UUID: 156 iip->ili_format.ilf_fields &= 157 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 158 XFS_ILOG_DEXT | 
XFS_ILOG_DEV); 159 break; 160 161 default: 162 ASSERT(0); 163 break; 164 } 165 166 /* 167 * If there are no attributes associated with this file, 168 * then there cannot be anything more to log. 169 * Clear all attribute-related log flags. 170 */ 171 if (!XFS_IFORK_Q(ip)) { 172 iip->ili_format.ilf_fields &= 173 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 174 return nvecs; 175 } 176 177 /* 178 * Log any necessary attribute data. 179 */ 180 switch (ip->i_d.di_aformat) { 181 case XFS_DINODE_FMT_EXTENTS: 182 iip->ili_format.ilf_fields &= 183 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); 184 if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && 185 (ip->i_d.di_anextents > 0) && 186 (ip->i_afp->if_bytes > 0)) { 187 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 188 nvecs++; 189 } else { 190 iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; 191 } 192 break; 193 194 case XFS_DINODE_FMT_BTREE: 195 iip->ili_format.ilf_fields &= 196 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 197 if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && 198 (ip->i_afp->if_broot_bytes > 0)) { 199 ASSERT(ip->i_afp->if_broot != NULL); 200 nvecs++; 201 } else { 202 iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; 203 } 204 break; 205 206 case XFS_DINODE_FMT_LOCAL: 207 iip->ili_format.ilf_fields &= 208 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 209 if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && 210 (ip->i_afp->if_bytes > 0)) { 211 ASSERT(ip->i_afp->if_u1.if_data != NULL); 212 nvecs++; 213 } else { 214 iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; 215 } 216 break; 217 218 default: 219 ASSERT(0); 220 break; 221 } 222 223 return nvecs; 224 } 225 226 /* 227 * This is called to fill in the vector of log iovecs for the 228 * given inode log item. It fills the first item with an inode 229 * log format structure, the second with the on-disk inode structure, 230 * and a possible third and/or fourth with the inode data/extents/b-tree 231 * root and inode attributes data/extents/b-tree root. 
232 */ 233 STATIC void 234 xfs_inode_item_format( 235 xfs_inode_log_item_t *iip, 236 xfs_log_iovec_t *log_vector) 237 { 238 uint nvecs; 239 xfs_log_iovec_t *vecp; 240 xfs_inode_t *ip; 241 size_t data_bytes; 242 xfs_bmbt_rec_t *ext_buffer; 243 int nrecs; 244 xfs_mount_t *mp; 245 246 ip = iip->ili_inode; 247 vecp = log_vector; 248 249 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 250 vecp->i_len = sizeof(xfs_inode_log_format_t); 251 vecp++; 252 nvecs = 1; 253 254 /* 255 * Clear i_update_core if the timestamps (or any other 256 * non-transactional modification) need flushing/logging 257 * and we're about to log them with the rest of the core. 258 * 259 * This is the same logic as xfs_iflush() but this code can't 260 * run at the same time as xfs_iflush because we're in commit 261 * processing here and so we have the inode lock held in 262 * exclusive mode. Although it doesn't really matter 263 * for the timestamps if both routines were to grab the 264 * timestamps or not. That would be ok. 265 * 266 * We clear i_update_core before copying out the data. 267 * This is for coordination with our timestamp updates 268 * that don't hold the inode lock. They will always 269 * update the timestamps BEFORE setting i_update_core, 270 * so if we clear i_update_core after they set it we 271 * are guaranteed to see their updates to the timestamps 272 * either here. Likewise, if they set it after we clear it 273 * here, we'll see it either on the next commit of this 274 * inode or the next time the inode gets flushed via 275 * xfs_iflush(). This depends on strongly ordered memory 276 * semantics, but we have that. We use the SYNCHRONIZE 277 * macro to make sure that the compiler does not reorder 278 * the i_update_core access below the data copy below. 
279 */ 280 if (ip->i_update_core) { 281 ip->i_update_core = 0; 282 SYNCHRONIZE(); 283 } 284 285 /* 286 * We don't have to worry about re-ordering here because 287 * the update_size field is protected by the inode lock 288 * and we have that held in exclusive mode. 289 */ 290 if (ip->i_update_size) 291 ip->i_update_size = 0; 292 293 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 294 vecp->i_len = sizeof(xfs_dinode_core_t); 295 vecp++; 296 nvecs++; 297 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 298 299 /* 300 * If this is really an old format inode, then we need to 301 * log it as such. This means that we have to copy the link 302 * count from the new field to the old. We don't have to worry 303 * about the new fields, because nothing trusts them as long as 304 * the old inode version number is there. If the superblock already 305 * has a new version number, then we don't bother converting back. 306 */ 307 mp = ip->i_mount; 308 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 309 XFS_SB_VERSION_HASNLINK(&mp->m_sb)); 310 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 311 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { 312 /* 313 * Convert it back. 314 */ 315 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 316 ip->i_d.di_onlink = ip->i_d.di_nlink; 317 } else { 318 /* 319 * The superblock version has already been bumped, 320 * so just make the conversion to the new inode 321 * format permanent. 
322 */ 323 ip->i_d.di_version = XFS_DINODE_VERSION_2; 324 ip->i_d.di_onlink = 0; 325 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 326 } 327 } 328 329 switch (ip->i_d.di_format) { 330 case XFS_DINODE_FMT_EXTENTS: 331 ASSERT(!(iip->ili_format.ilf_fields & 332 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 333 XFS_ILOG_DEV | XFS_ILOG_UUID))); 334 if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { 335 ASSERT(ip->i_df.if_bytes > 0); 336 ASSERT(ip->i_df.if_u1.if_extents != NULL); 337 ASSERT(ip->i_d.di_nextents > 0); 338 ASSERT(iip->ili_extents_buf == NULL); 339 nrecs = ip->i_df.if_bytes / 340 (uint)sizeof(xfs_bmbt_rec_t); 341 ASSERT(nrecs > 0); 342 #if __BYTE_ORDER == __BIG_ENDIAN 343 if (nrecs == ip->i_d.di_nextents) { 344 /* 345 * There are no delayed allocation 346 * extents, so just point to the 347 * real extents array. 348 */ 349 vecp->i_addr = 350 (char *)(ip->i_df.if_u1.if_extents); 351 vecp->i_len = ip->i_df.if_bytes; 352 } else 353 #endif 354 { 355 /* 356 * There are delayed allocation extents 357 * in the inode, or we need to convert 358 * the extents to on disk format. 359 * Use xfs_iextents_copy() 360 * to copy only the real extents into 361 * a separate buffer. We'll free the 362 * buffer in the unlock routine. 
363 */ 364 ext_buffer = kmem_alloc(ip->i_df.if_bytes, 365 KM_SLEEP); 366 iip->ili_extents_buf = ext_buffer; 367 vecp->i_addr = (xfs_caddr_t)ext_buffer; 368 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 369 XFS_DATA_FORK); 370 } 371 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 372 iip->ili_format.ilf_dsize = vecp->i_len; 373 vecp++; 374 nvecs++; 375 } 376 break; 377 378 case XFS_DINODE_FMT_BTREE: 379 ASSERT(!(iip->ili_format.ilf_fields & 380 (XFS_ILOG_DDATA | XFS_ILOG_DEXT | 381 XFS_ILOG_DEV | XFS_ILOG_UUID))); 382 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { 383 ASSERT(ip->i_df.if_broot_bytes > 0); 384 ASSERT(ip->i_df.if_broot != NULL); 385 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 386 vecp->i_len = ip->i_df.if_broot_bytes; 387 vecp++; 388 nvecs++; 389 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 390 } 391 break; 392 393 case XFS_DINODE_FMT_LOCAL: 394 ASSERT(!(iip->ili_format.ilf_fields & 395 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 396 XFS_ILOG_DEV | XFS_ILOG_UUID))); 397 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { 398 ASSERT(ip->i_df.if_bytes > 0); 399 ASSERT(ip->i_df.if_u1.if_data != NULL); 400 ASSERT(ip->i_d.di_size > 0); 401 402 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; 403 /* 404 * Round i_bytes up to a word boundary. 405 * The underlying memory is guaranteed to 406 * to be there by xfs_idata_realloc(). 
407 */ 408 data_bytes = roundup(ip->i_df.if_bytes, 4); 409 ASSERT((ip->i_df.if_real_bytes == 0) || 410 (ip->i_df.if_real_bytes == data_bytes)); 411 vecp->i_len = (int)data_bytes; 412 vecp++; 413 nvecs++; 414 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 415 } 416 break; 417 418 case XFS_DINODE_FMT_DEV: 419 ASSERT(!(iip->ili_format.ilf_fields & 420 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 421 XFS_ILOG_DDATA | XFS_ILOG_UUID))); 422 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 423 iip->ili_format.ilf_u.ilfu_rdev = 424 ip->i_df.if_u2.if_rdev; 425 } 426 break; 427 428 case XFS_DINODE_FMT_UUID: 429 ASSERT(!(iip->ili_format.ilf_fields & 430 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | 431 XFS_ILOG_DDATA | XFS_ILOG_DEV))); 432 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 433 iip->ili_format.ilf_u.ilfu_uuid = 434 ip->i_df.if_u2.if_uuid; 435 } 436 break; 437 438 default: 439 ASSERT(0); 440 break; 441 } 442 443 /* 444 * If there are no attributes associated with the file, 445 * then we're done. 446 * Assert that no attribute-related log flags are set. 447 */ 448 if (!XFS_IFORK_Q(ip)) { 449 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 450 iip->ili_format.ilf_size = nvecs; 451 ASSERT(!(iip->ili_format.ilf_fields & 452 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 453 return; 454 } 455 456 switch (ip->i_d.di_aformat) { 457 case XFS_DINODE_FMT_EXTENTS: 458 ASSERT(!(iip->ili_format.ilf_fields & 459 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); 460 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { 461 ASSERT(ip->i_afp->if_bytes > 0); 462 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 463 ASSERT(ip->i_d.di_anextents > 0); 464 #ifdef DEBUG 465 nrecs = ip->i_afp->if_bytes / 466 (uint)sizeof(xfs_bmbt_rec_t); 467 #endif 468 ASSERT(nrecs > 0); 469 ASSERT(nrecs == ip->i_d.di_anextents); 470 #if __BYTE_ORDER == __BIG_ENDIAN 471 /* 472 * There are not delayed allocation extents 473 * for attributes, so just point at the array. 
474 */ 475 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); 476 vecp->i_len = ip->i_afp->if_bytes; 477 #else 478 ASSERT(iip->ili_aextents_buf == NULL); 479 /* 480 * Need to endian flip before logging 481 */ 482 ext_buffer = kmem_alloc(ip->i_afp->if_bytes, 483 KM_SLEEP); 484 iip->ili_aextents_buf = ext_buffer; 485 vecp->i_addr = (xfs_caddr_t)ext_buffer; 486 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 487 XFS_ATTR_FORK); 488 #endif 489 iip->ili_format.ilf_asize = vecp->i_len; 490 vecp++; 491 nvecs++; 492 } 493 break; 494 495 case XFS_DINODE_FMT_BTREE: 496 ASSERT(!(iip->ili_format.ilf_fields & 497 (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); 498 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { 499 ASSERT(ip->i_afp->if_broot_bytes > 0); 500 ASSERT(ip->i_afp->if_broot != NULL); 501 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 502 vecp->i_len = ip->i_afp->if_broot_bytes; 503 vecp++; 504 nvecs++; 505 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 506 } 507 break; 508 509 case XFS_DINODE_FMT_LOCAL: 510 ASSERT(!(iip->ili_format.ilf_fields & 511 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 512 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { 513 ASSERT(ip->i_afp->if_bytes > 0); 514 ASSERT(ip->i_afp->if_u1.if_data != NULL); 515 516 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; 517 /* 518 * Round i_bytes up to a word boundary. 519 * The underlying memory is guaranteed to 520 * to be there by xfs_idata_realloc(). 
521 */ 522 data_bytes = roundup(ip->i_afp->if_bytes, 4); 523 ASSERT((ip->i_afp->if_real_bytes == 0) || 524 (ip->i_afp->if_real_bytes == data_bytes)); 525 vecp->i_len = (int)data_bytes; 526 vecp++; 527 nvecs++; 528 iip->ili_format.ilf_asize = (unsigned)data_bytes; 529 } 530 break; 531 532 default: 533 ASSERT(0); 534 break; 535 } 536 537 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 538 iip->ili_format.ilf_size = nvecs; 539 } 540 541 542 /* 543 * This is called to pin the inode associated with the inode log 544 * item in memory so it cannot be written out. Do this by calling 545 * xfs_ipin() to bump the pin count in the inode while holding the 546 * inode pin lock. 547 */ 548 STATIC void 549 xfs_inode_item_pin( 550 xfs_inode_log_item_t *iip) 551 { 552 ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); 553 xfs_ipin(iip->ili_inode); 554 } 555 556 557 /* 558 * This is called to unpin the inode associated with the inode log 559 * item which was previously pinned with a call to xfs_inode_item_pin(). 560 * Just call xfs_iunpin() on the inode to do this. 561 */ 562 /* ARGSUSED */ 563 STATIC void 564 xfs_inode_item_unpin( 565 xfs_inode_log_item_t *iip, 566 int stale) 567 { 568 xfs_iunpin(iip->ili_inode); 569 } 570 571 /* ARGSUSED */ 572 STATIC void 573 xfs_inode_item_unpin_remove( 574 xfs_inode_log_item_t *iip, 575 xfs_trans_t *tp) 576 { 577 xfs_iunpin(iip->ili_inode); 578 } 579 580 /* 581 * This is called to attempt to lock the inode associated with this 582 * inode log item, in preparation for the push routine which does the actual 583 * iflush. Don't sleep on the inode lock or the flush lock. 584 * 585 * If the flush lock is already held, indicating that the inode has 586 * been or is in the process of being flushed, then (ideally) we'd like to 587 * see if the inode's buffer is still incore, and if so give it a nudge. 
588 * We delay doing so until the pushbuf routine, though, to avoid holding 589 * the AIL lock across a call to the blackhole which is the buffercache. 590 * Also we don't want to sleep in any device strategy routines, which can happen 591 * if we do the subsequent bawrite in here. 592 */ 593 STATIC uint 594 xfs_inode_item_trylock( 595 xfs_inode_log_item_t *iip) 596 { 597 register xfs_inode_t *ip; 598 599 ip = iip->ili_inode; 600 601 if (xfs_ipincount(ip) > 0) { 602 return XFS_ITEM_PINNED; 603 } 604 605 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 606 return XFS_ITEM_LOCKED; 607 } 608 609 if (!xfs_iflock_nowait(ip)) { 610 /* 611 * If someone else isn't already trying to push the inode 612 * buffer, we get to do it. 613 */ 614 if (iip->ili_pushbuf_flag == 0) { 615 iip->ili_pushbuf_flag = 1; 616 #ifdef DEBUG 617 iip->ili_push_owner = get_thread_id(); 618 #endif 619 /* 620 * Inode is left locked in shared mode. 621 * Pushbuf routine gets to unlock it. 622 */ 623 return XFS_ITEM_PUSHBUF; 624 } else { 625 /* 626 * We hold the AIL_LOCK, so we must specify the 627 * NONOTIFY flag so that we won't double trip. 628 */ 629 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 630 return XFS_ITEM_FLUSHING; 631 } 632 /* NOTREACHED */ 633 } 634 635 /* Stale items should force out the iclog */ 636 if (ip->i_flags & XFS_ISTALE) { 637 xfs_ifunlock(ip); 638 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 639 return XFS_ITEM_PINNED; 640 } 641 642 #ifdef DEBUG 643 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 644 ASSERT(iip->ili_format.ilf_fields != 0); 645 ASSERT(iip->ili_logged == 0); 646 ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); 647 } 648 #endif 649 return XFS_ITEM_SUCCESS; 650 } 651 652 /* 653 * Unlock the inode associated with the inode log item. 654 * Clear the fields of the inode and inode log item that 655 * are specific to the current transaction. If the 656 * hold flags is set, do not unlock the inode. 
657 */ 658 STATIC void 659 xfs_inode_item_unlock( 660 xfs_inode_log_item_t *iip) 661 { 662 uint hold; 663 uint iolocked; 664 uint lock_flags; 665 xfs_inode_t *ip; 666 667 ASSERT(iip != NULL); 668 ASSERT(iip->ili_inode->i_itemp != NULL); 669 ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); 670 ASSERT((!(iip->ili_inode->i_itemp->ili_flags & 671 XFS_ILI_IOLOCKED_EXCL)) || 672 ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE)); 673 ASSERT((!(iip->ili_inode->i_itemp->ili_flags & 674 XFS_ILI_IOLOCKED_SHARED)) || 675 ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS)); 676 /* 677 * Clear the transaction pointer in the inode. 678 */ 679 ip = iip->ili_inode; 680 ip->i_transp = NULL; 681 682 /* 683 * If the inode needed a separate buffer with which to log 684 * its extents, then free it now. 685 */ 686 if (iip->ili_extents_buf != NULL) { 687 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); 688 ASSERT(ip->i_d.di_nextents > 0); 689 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); 690 ASSERT(ip->i_df.if_bytes > 0); 691 kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes); 692 iip->ili_extents_buf = NULL; 693 } 694 if (iip->ili_aextents_buf != NULL) { 695 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); 696 ASSERT(ip->i_d.di_anextents > 0); 697 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); 698 ASSERT(ip->i_afp->if_bytes > 0); 699 kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes); 700 iip->ili_aextents_buf = NULL; 701 } 702 703 /* 704 * Figure out if we should unlock the inode or not. 705 */ 706 hold = iip->ili_flags & XFS_ILI_HOLD; 707 708 /* 709 * Before clearing out the flags, remember whether we 710 * are holding the inode's IO lock. 711 */ 712 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY; 713 714 /* 715 * Clear out the fields of the inode log item particular 716 * to the current transaction. 
717 */ 718 iip->ili_ilock_recur = 0; 719 iip->ili_iolock_recur = 0; 720 iip->ili_flags = 0; 721 722 /* 723 * Unlock the inode if XFS_ILI_HOLD was not set. 724 */ 725 if (!hold) { 726 lock_flags = XFS_ILOCK_EXCL; 727 if (iolocked & XFS_ILI_IOLOCKED_EXCL) { 728 lock_flags |= XFS_IOLOCK_EXCL; 729 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) { 730 lock_flags |= XFS_IOLOCK_SHARED; 731 } 732 xfs_iput(iip->ili_inode, lock_flags); 733 } 734 } 735 736 /* 737 * This is called to find out where the oldest active copy of the 738 * inode log item in the on disk log resides now that the last log 739 * write of it completed at the given lsn. Since we always re-log 740 * all dirty data in an inode, the latest copy in the on disk log 741 * is the only one that matters. Therefore, simply return the 742 * given lsn. 743 */ 744 /*ARGSUSED*/ 745 STATIC xfs_lsn_t 746 xfs_inode_item_committed( 747 xfs_inode_log_item_t *iip, 748 xfs_lsn_t lsn) 749 { 750 return (lsn); 751 } 752 753 /* 754 * The transaction with the inode locked has aborted. The inode 755 * must not be dirty within the transaction (unless we're forcibly 756 * shutting down). We simply unlock just as if the transaction 757 * had been cancelled. 758 */ 759 STATIC void 760 xfs_inode_item_abort( 761 xfs_inode_log_item_t *iip) 762 { 763 xfs_inode_item_unlock(iip); 764 return; 765 } 766 767 768 /* 769 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 770 * failed to get the inode flush lock but did get the inode locked SHARED. 771 * Here we're trying to see if the inode buffer is incore, and if so whether it's 772 * marked delayed write. If that's the case, we'll initiate a bawrite on that 773 * buffer to expedite the process. 774 * 775 * We aren't holding the AIL_LOCK (or the flush lock) when this gets called, 776 * so it is inherently race-y. 
777 */ 778 STATIC void 779 xfs_inode_item_pushbuf( 780 xfs_inode_log_item_t *iip) 781 { 782 xfs_inode_t *ip; 783 xfs_mount_t *mp; 784 xfs_buf_t *bp; 785 uint dopush; 786 787 ip = iip->ili_inode; 788 789 ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); 790 791 /* 792 * The ili_pushbuf_flag keeps others from 793 * trying to duplicate our effort. 794 */ 795 ASSERT(iip->ili_pushbuf_flag != 0); 796 ASSERT(iip->ili_push_owner == get_thread_id()); 797 798 /* 799 * If flushlock isn't locked anymore, chances are that the 800 * inode flush completed and the inode was taken off the AIL. 801 * So, just get out. 802 */ 803 if ((valusema(&(ip->i_flock)) > 0) || 804 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 805 iip->ili_pushbuf_flag = 0; 806 xfs_iunlock(ip, XFS_ILOCK_SHARED); 807 return; 808 } 809 810 mp = ip->i_mount; 811 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 812 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); 813 814 if (bp != NULL) { 815 if (XFS_BUF_ISDELAYWRITE(bp)) { 816 /* 817 * We were racing with iflush because we don't hold 818 * the AIL_LOCK or the flush lock. However, at this point, 819 * we have the buffer, and we know that it's dirty. 820 * So, it's possible that iflush raced with us, and 821 * this item is already taken off the AIL. 822 * If not, we can flush it async. 823 */ 824 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && 825 (valusema(&(ip->i_flock)) <= 0)); 826 iip->ili_pushbuf_flag = 0; 827 xfs_iunlock(ip, XFS_ILOCK_SHARED); 828 xfs_buftrace("INODE ITEM PUSH", bp); 829 if (XFS_BUF_ISPINNED(bp)) { 830 xfs_log_force(mp, (xfs_lsn_t)0, 831 XFS_LOG_FORCE); 832 } 833 if (dopush) { 834 xfs_bawrite(mp, bp); 835 } else { 836 xfs_buf_relse(bp); 837 } 838 } else { 839 iip->ili_pushbuf_flag = 0; 840 xfs_iunlock(ip, XFS_ILOCK_SHARED); 841 xfs_buf_relse(bp); 842 } 843 return; 844 } 845 /* 846 * We have to be careful about resetting pushbuf flag too early (above). 
847 * Even though in theory we can do it as soon as we have the buflock, 848 * we don't want others to be doing work needlessly. They'll come to 849 * this function thinking that pushing the buffer is their 850 * responsibility only to find that the buffer is still locked by 851 * another doing the same thing 852 */ 853 iip->ili_pushbuf_flag = 0; 854 xfs_iunlock(ip, XFS_ILOCK_SHARED); 855 return; 856 } 857 858 859 /* 860 * This is called to asynchronously write the inode associated with this 861 * inode log item out to disk. The inode will already have been locked by 862 * a successful call to xfs_inode_item_trylock(). 863 */ 864 STATIC void 865 xfs_inode_item_push( 866 xfs_inode_log_item_t *iip) 867 { 868 xfs_inode_t *ip; 869 870 ip = iip->ili_inode; 871 872 ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); 873 ASSERT(valusema(&(ip->i_flock)) <= 0); 874 /* 875 * Since we were able to lock the inode's flush lock and 876 * we found it on the AIL, the inode must be dirty. This 877 * is because the inode is removed from the AIL while still 878 * holding the flush lock in xfs_iflush_done(). Thus, if 879 * we found it in the AIL and were able to obtain the flush 880 * lock without sleeping, then there must not have been 881 * anyone in the process of flushing the inode. 882 */ 883 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || 884 iip->ili_format.ilf_fields != 0); 885 886 /* 887 * Write out the inode. The completion routine ('iflush_done') will 888 * pull it from the AIL, mark it clean, unlock the flush lock. 889 */ 890 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 891 xfs_iunlock(ip, XFS_ILOCK_SHARED); 892 893 return; 894 } 895 896 /* 897 * XXX rcc - this one really has to do something. Probably needs 898 * to stamp in a new field in the incore inode. 
899 */ 900 /* ARGSUSED */ 901 STATIC void 902 xfs_inode_item_committing( 903 xfs_inode_log_item_t *iip, 904 xfs_lsn_t lsn) 905 { 906 iip->ili_last_lsn = lsn; 907 return; 908 } 909 910 /* 911 * This is the ops vector shared by all buf log items. 912 */ 913 STATIC struct xfs_item_ops xfs_inode_item_ops = { 914 .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, 915 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 916 xfs_inode_item_format, 917 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 918 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 919 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 920 xfs_inode_item_unpin_remove, 921 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 922 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, 923 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 924 xfs_inode_item_committed, 925 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, 926 .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort, 927 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, 928 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 929 xfs_inode_item_committing 930 }; 931 932 933 /* 934 * Initialize the inode log item for a newly allocated (in-core) inode. 935 */ 936 void 937 xfs_inode_item_init( 938 xfs_inode_t *ip, 939 xfs_mount_t *mp) 940 { 941 xfs_inode_log_item_t *iip; 942 943 ASSERT(ip->i_itemp == NULL); 944 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 945 946 iip->ili_item.li_type = XFS_LI_INODE; 947 iip->ili_item.li_ops = &xfs_inode_item_ops; 948 iip->ili_item.li_mountp = mp; 949 iip->ili_inode = ip; 950 951 /* 952 We have zeroed memory. No need ... 
953 iip->ili_extents_buf = NULL; 954 iip->ili_pushbuf_flag = 0; 955 */ 956 957 iip->ili_format.ilf_type = XFS_LI_INODE; 958 iip->ili_format.ilf_ino = ip->i_ino; 959 iip->ili_format.ilf_blkno = ip->i_blkno; 960 iip->ili_format.ilf_len = ip->i_len; 961 iip->ili_format.ilf_boffset = ip->i_boffset; 962 } 963 964 /* 965 * Free the inode log item and any memory hanging off of it. 966 */ 967 void 968 xfs_inode_item_destroy( 969 xfs_inode_t *ip) 970 { 971 #ifdef XFS_TRANS_DEBUG 972 if (ip->i_itemp->ili_root_size != 0) { 973 kmem_free(ip->i_itemp->ili_orig_root, 974 ip->i_itemp->ili_root_size); 975 } 976 #endif 977 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 978 } 979 980 981 /* 982 * This is the inode flushing I/O completion routine. It is called 983 * from interrupt level when the buffer containing the inode is 984 * flushed to disk. It is responsible for removing the inode item 985 * from the AIL if it has not been re-logged, and unlocking the inode's 986 * flush lock. 987 */ 988 /*ARGSUSED*/ 989 void 990 xfs_iflush_done( 991 xfs_buf_t *bp, 992 xfs_inode_log_item_t *iip) 993 { 994 xfs_inode_t *ip; 995 SPLDECL(s); 996 997 ip = iip->ili_inode; 998 999 /* 1000 * We only want to pull the item from the AIL if it is 1001 * actually there and its location in the log has not 1002 * changed since we started the flush. Thus, we only bother 1003 * if the ili_logged flag is set and the inode's lsn has not 1004 * changed. First we check the lsn outside 1005 * the lock since it's cheaper, and then we recheck while 1006 * holding the lock before removing the inode from the AIL. 1007 */ 1008 if (iip->ili_logged && 1009 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { 1010 AIL_LOCK(ip->i_mount, s); 1011 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 1012 /* 1013 * xfs_trans_delete_ail() drops the AIL lock. 
1014 */ 1015 xfs_trans_delete_ail(ip->i_mount, 1016 (xfs_log_item_t*)iip, s); 1017 } else { 1018 AIL_UNLOCK(ip->i_mount, s); 1019 } 1020 } 1021 1022 iip->ili_logged = 0; 1023 1024 /* 1025 * Clear the ili_last_fields bits now that we know that the 1026 * data corresponding to them is safely on disk. 1027 */ 1028 iip->ili_last_fields = 0; 1029 1030 /* 1031 * Release the inode's flush lock since we're done with it. 1032 */ 1033 xfs_ifunlock(ip); 1034 1035 return; 1036 } 1037 1038 /* 1039 * This is the inode flushing abort routine. It is called 1040 * from xfs_iflush when the filesystem is shutting down to clean 1041 * up the inode state. 1042 * It is responsible for removing the inode item 1043 * from the AIL if it has not been re-logged, and unlocking the inode's 1044 * flush lock. 1045 */ 1046 void 1047 xfs_iflush_abort( 1048 xfs_inode_t *ip) 1049 { 1050 xfs_inode_log_item_t *iip; 1051 xfs_mount_t *mp; 1052 SPLDECL(s); 1053 1054 iip = ip->i_itemp; 1055 mp = ip->i_mount; 1056 if (iip) { 1057 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1058 AIL_LOCK(mp, s); 1059 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1060 /* 1061 * xfs_trans_delete_ail() drops the AIL lock. 1062 */ 1063 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip, 1064 s); 1065 } else 1066 AIL_UNLOCK(mp, s); 1067 } 1068 iip->ili_logged = 0; 1069 /* 1070 * Clear the ili_last_fields bits now that we know that the 1071 * data corresponding to them is safely on disk. 1072 */ 1073 iip->ili_last_fields = 0; 1074 /* 1075 * Clear the inode logging fields so no more flushes are 1076 * attempted. 1077 */ 1078 iip->ili_format.ilf_fields = 0; 1079 } 1080 /* 1081 * Release the inode's flush lock since we're done with it. 1082 */ 1083 xfs_ifunlock(ip); 1084 } 1085 1086 void 1087 xfs_istale_done( 1088 xfs_buf_t *bp, 1089 xfs_inode_log_item_t *iip) 1090 { 1091 xfs_iflush_abort(iip->ili_inode); 1092 } 1093