1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 34 #define MLOG_MASK_PREFIX ML_INODE 35 #include <cluster/masklog.h> 36 37 #include "ocfs2.h" 38 39 #include "alloc.h" 40 #include "aops.h" 41 #include "dir.h" 42 #include "dlmglue.h" 43 #include "extent_map.h" 44 #include "file.h" 45 #include "sysfile.h" 46 #include "inode.h" 47 #include "journal.h" 48 #include "mmap.h" 49 #include "suballoc.h" 50 #include "super.h" 51 52 #include "buffer_head_io.h" 53 54 static int ocfs2_sync_inode(struct inode *inode) 55 { 56 filemap_fdatawrite(inode->i_mapping); 57 return sync_mapping_buffers(inode->i_mapping); 58 } 59 60 static int ocfs2_file_open(struct inode *inode, struct file *file) 61 { 62 int status; 63 int mode = file->f_flags; 64 struct ocfs2_inode_info *oi = OCFS2_I(inode); 65 66 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 67 file->f_dentry->d_name.len, file->f_dentry->d_name.name); 68 69 spin_lock(&oi->ip_lock); 70 71 /* Check that the inode hasn't been wiped from disk by another 72 * node. If it hasn't then we're safe as long as we hold the 73 * spin lock until our increment of open count. */ 74 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 75 spin_unlock(&oi->ip_lock); 76 77 status = -ENOENT; 78 goto leave; 79 } 80 81 if (mode & O_DIRECT) 82 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 83 84 oi->ip_open_count++; 85 spin_unlock(&oi->ip_lock); 86 status = 0; 87 leave: 88 mlog_exit(status); 89 return status; 90 } 91 92 static int ocfs2_file_release(struct inode *inode, struct file *file) 93 { 94 struct ocfs2_inode_info *oi = OCFS2_I(inode); 95 96 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 97 file->f_dentry->d_name.len, 98 file->f_dentry->d_name.name); 99 100 spin_lock(&oi->ip_lock); 101 if (!--oi->ip_open_count) 102 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 103 spin_unlock(&oi->ip_lock); 104 105 mlog_exit(0); 106 107 return 0; 108 } 109 110 static int ocfs2_sync_file(struct file *file, 111 struct dentry *dentry, 112 int datasync) 113 { 114 int err = 0; 115 journal_t *journal; 116 struct inode *inode = dentry->d_inode; 117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 118 119 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 120 dentry->d_name.len, dentry->d_name.name); 121 122 err = ocfs2_sync_inode(dentry->d_inode); 123 if (err) 124 goto bail; 125 126 journal = osb->journal->j_journal; 127 err = journal_force_commit(journal); 128 129 bail: 130 mlog_exit(err); 131 132 return (err < 0) ? -EIO : 0; 133 } 134 135 int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, 136 struct inode *inode, 137 struct buffer_head *fe_bh, 138 u64 new_i_size) 139 { 140 int status; 141 142 mlog_entry_void(); 143 i_size_write(inode, new_i_size); 144 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 145 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 146 147 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 148 if (status < 0) { 149 mlog_errno(status); 150 goto bail; 151 } 152 153 bail: 154 mlog_exit(status); 155 return status; 156 } 157 158 static int ocfs2_simple_size_update(struct inode *inode, 159 struct buffer_head *di_bh, 160 u64 new_i_size) 161 { 162 int ret; 163 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 164 struct ocfs2_journal_handle *handle = NULL; 165 166 handle = ocfs2_start_trans(osb, NULL, 167 OCFS2_INODE_UPDATE_CREDITS); 168 if (handle == NULL) { 169 ret = -ENOMEM; 170 mlog_errno(ret); 171 goto out; 172 } 173 174 ret = ocfs2_set_inode_size(handle, inode, di_bh, 175 new_i_size); 176 if (ret < 0) 177 mlog_errno(ret); 178 179 ocfs2_commit_trans(handle); 180 out: 181 return ret; 182 } 183 184 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 185 struct inode *inode, 186 struct buffer_head *fe_bh, 187 u64 new_i_size) 188 { 189 int status; 190 struct ocfs2_journal_handle *handle; 191 192 mlog_entry_void(); 193 194 /* TODO: This needs to actually orphan the inode in this 195 * transaction. */ 196 197 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 198 if (IS_ERR(handle)) { 199 status = PTR_ERR(handle); 200 mlog_errno(status); 201 goto out; 202 } 203 204 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 205 if (status < 0) 206 mlog_errno(status); 207 208 ocfs2_commit_trans(handle); 209 out: 210 mlog_exit(status); 211 return status; 212 } 213 214 static int ocfs2_truncate_file(struct inode *inode, 215 struct buffer_head *di_bh, 216 u64 new_i_size) 217 { 218 int status = 0; 219 struct ocfs2_dinode *fe = NULL; 220 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 221 struct ocfs2_truncate_context *tc = NULL; 222 223 mlog_entry("(inode = %llu, new_i_size = %llu\n", 224 (unsigned long long)OCFS2_I(inode)->ip_blkno, 225 (unsigned long long)new_i_size); 226 227 truncate_inode_pages(inode->i_mapping, new_i_size); 228 229 fe = (struct ocfs2_dinode *) di_bh->b_data; 230 if (!OCFS2_IS_VALID_DINODE(fe)) { 231 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 232 status = -EIO; 233 goto bail; 234 } 235 236 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 237 "Inode %llu, inode i_size = %lld != di " 238 "i_size = %llu, i_flags = 0x%x\n", 239 (unsigned long long)OCFS2_I(inode)->ip_blkno, 240 i_size_read(inode), 241 (unsigned long long)le64_to_cpu(fe->i_size), 242 le32_to_cpu(fe->i_flags)); 243 244 if (new_i_size > le64_to_cpu(fe->i_size)) { 245 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 246 (unsigned long long)le64_to_cpu(fe->i_size), 247 (unsigned long long)new_i_size); 248 status = -EINVAL; 249 mlog_errno(status); 250 goto bail; 251 } 252 253 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 254 (unsigned long long)le64_to_cpu(fe->i_blkno), 255 (unsigned long long)le64_to_cpu(fe->i_size), 256 (unsigned long long)new_i_size); 257 258 /* lets handle the simple truncate cases before doing any more 259 * cluster locking. */ 260 if (new_i_size == le64_to_cpu(fe->i_size)) 261 goto bail; 262 263 if (le32_to_cpu(fe->i_clusters) == 264 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { 265 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", 266 fe->i_clusters); 267 /* No allocation change is required, so lets fast path 268 * this truncate. */ 269 status = ocfs2_simple_size_update(inode, di_bh, new_i_size); 270 if (status < 0) 271 mlog_errno(status); 272 goto bail; 273 } 274 275 /* This forces other nodes to sync and drop their pages */ 276 status = ocfs2_data_lock(inode, 1); 277 if (status < 0) { 278 mlog_errno(status); 279 goto bail; 280 } 281 ocfs2_data_unlock(inode, 1); 282 283 /* alright, we're going to need to do a full blown alloc size 284 * change. Orphan the inode so that recovery can complete the 285 * truncate if necessary. This does the task of marking 286 * i_size. */ 287 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 288 if (status < 0) { 289 mlog_errno(status); 290 goto bail; 291 } 292 293 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 294 if (status < 0) { 295 mlog_errno(status); 296 goto bail; 297 } 298 299 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 300 if (status < 0) { 301 mlog_errno(status); 302 goto bail; 303 } 304 305 /* TODO: orphan dir cleanup here. */ 306 bail: 307 308 mlog_exit(status); 309 return status; 310 } 311 312 /* 313 * extend allocation only here. 314 * we'll update all the disk stuff, and oip->alloc_size 315 * 316 * expect stuff to be locked, a transaction started and enough data / 317 * metadata reservations in the contexts. 318 * 319 * Will return -EAGAIN, and a reason if a restart is needed. 320 * If passed in, *reason will always be set, even in error. 321 */ 322 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 323 struct inode *inode, 324 u32 clusters_to_add, 325 struct buffer_head *fe_bh, 326 struct ocfs2_journal_handle *handle, 327 struct ocfs2_alloc_context *data_ac, 328 struct ocfs2_alloc_context *meta_ac, 329 enum ocfs2_alloc_restarted *reason_ret) 330 { 331 int status = 0; 332 int free_extents; 333 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 334 enum ocfs2_alloc_restarted reason = RESTART_NONE; 335 u32 bit_off, num_bits; 336 u64 block; 337 338 BUG_ON(!clusters_to_add); 339 340 free_extents = ocfs2_num_free_extents(osb, inode, fe); 341 if (free_extents < 0) { 342 status = free_extents; 343 mlog_errno(status); 344 goto leave; 345 } 346 347 /* there are two cases which could cause us to EAGAIN in the 348 * we-need-more-metadata case: 349 * 1) we haven't reserved *any* 350 * 2) we are so fragmented, we've needed to add metadata too 351 * many times. */ 352 if (!free_extents && !meta_ac) { 353 mlog(0, "we haven't reserved any metadata!\n"); 354 status = -EAGAIN; 355 reason = RESTART_META; 356 goto leave; 357 } else if ((!free_extents) 358 && (ocfs2_alloc_context_bits_left(meta_ac) 359 < ocfs2_extend_meta_needed(fe))) { 360 mlog(0, "filesystem is really fragmented...\n"); 361 status = -EAGAIN; 362 reason = RESTART_META; 363 goto leave; 364 } 365 366 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 367 &bit_off, &num_bits); 368 if (status < 0) { 369 if (status != -ENOSPC) 370 mlog_errno(status); 371 goto leave; 372 } 373 374 BUG_ON(num_bits > clusters_to_add); 375 376 /* reserve our write early -- insert_extent may update the inode */ 377 status = ocfs2_journal_access(handle, inode, fe_bh, 378 OCFS2_JOURNAL_ACCESS_WRITE); 379 if (status < 0) { 380 mlog_errno(status); 381 goto leave; 382 } 383 384 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 385 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 386 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 387 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 388 num_bits, meta_ac); 389 if (status < 0) { 390 mlog_errno(status); 391 goto leave; 392 } 393 394 le32_add_cpu(&fe->i_clusters, num_bits); 395 spin_lock(&OCFS2_I(inode)->ip_lock); 396 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 397 spin_unlock(&OCFS2_I(inode)->ip_lock); 398 399 status = ocfs2_journal_dirty(handle, fe_bh); 400 if (status < 0) { 401 mlog_errno(status); 402 goto leave; 403 } 404 405 clusters_to_add -= num_bits; 406 407 if (clusters_to_add) { 408 mlog(0, "need to alloc once more, clusters = %u, wanted = " 409 "%u\n", fe->i_clusters, clusters_to_add); 410 status = -EAGAIN; 411 reason = RESTART_TRANS; 412 } 413 414 leave: 415 mlog_exit(status); 416 if (reason_ret) 417 *reason_ret = reason; 418 return status; 419 } 420 421 static int ocfs2_extend_allocation(struct inode *inode, 422 u32 clusters_to_add) 423 { 424 int status = 0; 425 int restart_func = 0; 426 int drop_alloc_sem = 0; 427 int credits, num_free_extents; 428 u32 prev_clusters; 429 struct buffer_head *bh = NULL; 430 struct ocfs2_dinode *fe = NULL; 431 struct ocfs2_journal_handle *handle = NULL; 432 struct ocfs2_alloc_context *data_ac = NULL; 433 struct ocfs2_alloc_context *meta_ac = NULL; 434 enum ocfs2_alloc_restarted why; 435 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 436 437 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 438 439 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 440 OCFS2_BH_CACHED, inode); 441 if (status < 0) { 442 mlog_errno(status); 443 goto leave; 444 } 445 446 fe = (struct ocfs2_dinode *) bh->b_data; 447 if (!OCFS2_IS_VALID_DINODE(fe)) { 448 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 449 status = -EIO; 450 goto leave; 451 } 452 453 restart_all: 454 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 455 456 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " 457 "clusters_to_add = %u\n", 458 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 459 fe->i_clusters, clusters_to_add); 460 461 handle = ocfs2_alloc_handle(osb); 462 if (handle == NULL) { 463 status = -ENOMEM; 464 mlog_errno(status); 465 goto leave; 466 } 467 468 num_free_extents = ocfs2_num_free_extents(osb, 469 inode, 470 fe); 471 if (num_free_extents < 0) { 472 status = num_free_extents; 473 mlog_errno(status); 474 goto leave; 475 } 476 477 if (!num_free_extents) { 478 status = ocfs2_reserve_new_metadata(osb, 479 handle, 480 fe, 481 &meta_ac); 482 if (status < 0) { 483 if (status != -ENOSPC) 484 mlog_errno(status); 485 goto leave; 486 } 487 } 488 489 status = ocfs2_reserve_clusters(osb, 490 handle, 491 clusters_to_add, 492 &data_ac); 493 if (status < 0) { 494 if (status != -ENOSPC) 495 mlog_errno(status); 496 goto leave; 497 } 498 499 /* blocks peope in read/write from reading our allocation 500 * until we're done changing it. We depend on i_mutex to block 501 * other extend/truncate calls while we're here. Ordering wrt 502 * start_trans is important here -- always do it before! */ 503 down_write(&OCFS2_I(inode)->ip_alloc_sem); 504 drop_alloc_sem = 1; 505 506 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 507 handle = ocfs2_start_trans(osb, handle, credits); 508 if (IS_ERR(handle)) { 509 status = PTR_ERR(handle); 510 handle = NULL; 511 mlog_errno(status); 512 goto leave; 513 } 514 515 restarted_transaction: 516 /* reserve a write to the file entry early on - that we if we 517 * run out of credits in the allocation path, we can still 518 * update i_size. */ 519 status = ocfs2_journal_access(handle, inode, bh, 520 OCFS2_JOURNAL_ACCESS_WRITE); 521 if (status < 0) { 522 mlog_errno(status); 523 goto leave; 524 } 525 526 prev_clusters = OCFS2_I(inode)->ip_clusters; 527 528 status = ocfs2_do_extend_allocation(osb, 529 inode, 530 clusters_to_add, 531 bh, 532 handle, 533 data_ac, 534 meta_ac, 535 &why); 536 if ((status < 0) && (status != -EAGAIN)) { 537 if (status != -ENOSPC) 538 mlog_errno(status); 539 goto leave; 540 } 541 542 status = ocfs2_journal_dirty(handle, bh); 543 if (status < 0) { 544 mlog_errno(status); 545 goto leave; 546 } 547 548 spin_lock(&OCFS2_I(inode)->ip_lock); 549 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 550 spin_unlock(&OCFS2_I(inode)->ip_lock); 551 552 if (why != RESTART_NONE && clusters_to_add) { 553 if (why == RESTART_META) { 554 mlog(0, "restarting function.\n"); 555 restart_func = 1; 556 } else { 557 BUG_ON(why != RESTART_TRANS); 558 559 mlog(0, "restarting transaction.\n"); 560 /* TODO: This can be more intelligent. */ 561 credits = ocfs2_calc_extend_credits(osb->sb, 562 fe, 563 clusters_to_add); 564 status = ocfs2_extend_trans(handle, credits); 565 if (status < 0) { 566 /* handle still has to be committed at 567 * this point. */ 568 status = -ENOMEM; 569 mlog_errno(status); 570 goto leave; 571 } 572 goto restarted_transaction; 573 } 574 } 575 576 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 577 fe->i_clusters, (unsigned long long)fe->i_size); 578 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 579 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 580 581 leave: 582 if (drop_alloc_sem) { 583 up_write(&OCFS2_I(inode)->ip_alloc_sem); 584 drop_alloc_sem = 0; 585 } 586 if (handle) { 587 ocfs2_commit_trans(handle); 588 handle = NULL; 589 } 590 if (data_ac) { 591 ocfs2_free_alloc_context(data_ac); 592 data_ac = NULL; 593 } 594 if (meta_ac) { 595 ocfs2_free_alloc_context(meta_ac); 596 meta_ac = NULL; 597 } 598 if ((!status) && restart_func) { 599 restart_func = 0; 600 goto restart_all; 601 } 602 if (bh) { 603 brelse(bh); 604 bh = NULL; 605 } 606 607 mlog_exit(status); 608 return status; 609 } 610 611 /* Some parts of this taken from generic_cont_expand, which turned out 612 * to be too fragile to do exactly what we need without us having to 613 * worry about recursive locking in ->commit_write(). */ 614 static int ocfs2_write_zero_page(struct inode *inode, 615 u64 size) 616 { 617 struct address_space *mapping = inode->i_mapping; 618 struct page *page; 619 unsigned long index; 620 unsigned int offset; 621 struct ocfs2_journal_handle *handle = NULL; 622 int ret; 623 624 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 625 /* ugh. in prepare/commit_write, if from==to==start of block, we 626 ** skip the prepare. make sure we never send an offset for the start 627 ** of a block 628 */ 629 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 630 offset++; 631 } 632 index = size >> PAGE_CACHE_SHIFT; 633 634 page = grab_cache_page(mapping, index); 635 if (!page) { 636 ret = -ENOMEM; 637 mlog_errno(ret); 638 goto out; 639 } 640 641 ret = ocfs2_prepare_write(NULL, page, offset, offset); 642 if (ret < 0) { 643 mlog_errno(ret); 644 goto out_unlock; 645 } 646 647 if (ocfs2_should_order_data(inode)) { 648 handle = ocfs2_start_walk_page_trans(inode, page, offset, 649 offset); 650 if (IS_ERR(handle)) { 651 ret = PTR_ERR(handle); 652 handle = NULL; 653 goto out_unlock; 654 } 655 } 656 657 /* must not update i_size! */ 658 ret = block_commit_write(page, offset, offset); 659 if (ret < 0) 660 mlog_errno(ret); 661 else 662 ret = 0; 663 664 if (handle) 665 ocfs2_commit_trans(handle); 666 out_unlock: 667 unlock_page(page); 668 page_cache_release(page); 669 out: 670 return ret; 671 } 672 673 static int ocfs2_zero_extend(struct inode *inode, 674 u64 zero_to_size) 675 { 676 int ret = 0; 677 u64 start_off; 678 struct super_block *sb = inode->i_sb; 679 680 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 681 while (start_off < zero_to_size) { 682 ret = ocfs2_write_zero_page(inode, start_off); 683 if (ret < 0) { 684 mlog_errno(ret); 685 goto out; 686 } 687 688 start_off += sb->s_blocksize; 689 } 690 691 out: 692 return ret; 693 } 694 695 static int ocfs2_extend_file(struct inode *inode, 696 struct buffer_head *di_bh, 697 u64 new_i_size) 698 { 699 int ret = 0; 700 u32 clusters_to_add; 701 702 /* setattr sometimes calls us like this. */ 703 if (new_i_size == 0) 704 goto out; 705 706 if (i_size_read(inode) == new_i_size) 707 goto out; 708 BUG_ON(new_i_size < i_size_read(inode)); 709 710 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 711 OCFS2_I(inode)->ip_clusters; 712 713 if (clusters_to_add) { 714 ret = ocfs2_extend_allocation(inode, clusters_to_add); 715 if (ret < 0) { 716 mlog_errno(ret); 717 goto out; 718 } 719 720 ret = ocfs2_zero_extend(inode, new_i_size); 721 if (ret < 0) { 722 mlog_errno(ret); 723 goto out; 724 } 725 } 726 727 /* No allocation required, we just use this helper to 728 * do a trivial update of i_size. */ 729 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 730 if (ret < 0) { 731 mlog_errno(ret); 732 goto out; 733 } 734 735 out: 736 return ret; 737 } 738 739 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 740 { 741 int status = 0, size_change; 742 struct inode *inode = dentry->d_inode; 743 struct super_block *sb = inode->i_sb; 744 struct ocfs2_super *osb = OCFS2_SB(sb); 745 struct buffer_head *bh = NULL; 746 struct ocfs2_journal_handle *handle = NULL; 747 748 mlog_entry("(0x%p, '%.*s')\n", dentry, 749 dentry->d_name.len, dentry->d_name.name); 750 751 if (attr->ia_valid & ATTR_MODE) 752 mlog(0, "mode change: %d\n", attr->ia_mode); 753 if (attr->ia_valid & ATTR_UID) 754 mlog(0, "uid change: %d\n", attr->ia_uid); 755 if (attr->ia_valid & ATTR_GID) 756 mlog(0, "gid change: %d\n", attr->ia_gid); 757 if (attr->ia_valid & ATTR_SIZE) 758 mlog(0, "size change...\n"); 759 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 760 mlog(0, "time change...\n"); 761 762 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 763 | ATTR_GID | ATTR_UID | ATTR_MODE) 764 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 765 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 766 return 0; 767 } 768 769 status = inode_change_ok(inode, attr); 770 if (status) 771 return status; 772 773 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 774 if (size_change) { 775 status = ocfs2_rw_lock(inode, 1); 776 if (status < 0) { 777 mlog_errno(status); 778 goto bail; 779 } 780 } 781 782 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 783 if (status < 0) { 784 if (status != -ENOENT) 785 mlog_errno(status); 786 goto bail_unlock_rw; 787 } 788 789 if (size_change && attr->ia_size != i_size_read(inode)) { 790 if (i_size_read(inode) > attr->ia_size) 791 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 792 else 793 status = ocfs2_extend_file(inode, bh, attr->ia_size); 794 if (status < 0) { 795 if (status != -ENOSPC) 796 mlog_errno(status); 797 status = -ENOSPC; 798 goto bail_unlock; 799 } 800 } 801 802 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 803 if (IS_ERR(handle)) { 804 status = PTR_ERR(handle); 805 mlog_errno(status); 806 goto bail_unlock; 807 } 808 809 status = inode_setattr(inode, attr); 810 if (status < 0) { 811 mlog_errno(status); 812 goto bail_commit; 813 } 814 815 status = ocfs2_mark_inode_dirty(handle, inode, bh); 816 if (status < 0) 817 mlog_errno(status); 818 819 bail_commit: 820 ocfs2_commit_trans(handle); 821 bail_unlock: 822 ocfs2_meta_unlock(inode, 1); 823 bail_unlock_rw: 824 if (size_change) 825 ocfs2_rw_unlock(inode, 1); 826 bail: 827 if (bh) 828 brelse(bh); 829 830 mlog_exit(status); 831 return status; 832 } 833 834 int ocfs2_getattr(struct vfsmount *mnt, 835 struct dentry *dentry, 836 struct kstat *stat) 837 { 838 struct inode *inode = dentry->d_inode; 839 struct super_block *sb = dentry->d_inode->i_sb; 840 struct ocfs2_super *osb = sb->s_fs_info; 841 int err; 842 843 mlog_entry_void(); 844 845 err = ocfs2_inode_revalidate(dentry); 846 if (err) { 847 if (err != -ENOENT) 848 mlog_errno(err); 849 goto bail; 850 } 851 852 generic_fillattr(inode, stat); 853 854 /* We set the blksize from the cluster size for performance */ 855 stat->blksize = osb->s_clustersize; 856 857 bail: 858 mlog_exit(err); 859 860 return err; 861 } 862 863 static int ocfs2_write_remove_suid(struct inode *inode) 864 { 865 int ret; 866 struct buffer_head *bh = NULL; 867 struct ocfs2_inode_info *oi = OCFS2_I(inode); 868 struct ocfs2_journal_handle *handle; 869 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 870 struct ocfs2_dinode *di; 871 872 mlog_entry("(Inode %llu, mode 0%o)\n", 873 (unsigned long long)oi->ip_blkno, inode->i_mode); 874 875 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 876 if (handle == NULL) { 877 ret = -ENOMEM; 878 mlog_errno(ret); 879 goto out; 880 } 881 882 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 883 if (ret < 0) { 884 mlog_errno(ret); 885 goto out_trans; 886 } 887 888 ret = ocfs2_journal_access(handle, inode, bh, 889 OCFS2_JOURNAL_ACCESS_WRITE); 890 if (ret < 0) { 891 mlog_errno(ret); 892 goto out_bh; 893 } 894 895 inode->i_mode &= ~S_ISUID; 896 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 897 inode->i_mode &= ~S_ISGID; 898 899 di = (struct ocfs2_dinode *) bh->b_data; 900 di->i_mode = cpu_to_le16(inode->i_mode); 901 902 ret = ocfs2_journal_dirty(handle, bh); 903 if (ret < 0) 904 mlog_errno(ret); 905 out_bh: 906 brelse(bh); 907 out_trans: 908 ocfs2_commit_trans(handle); 909 out: 910 mlog_exit(ret); 911 return ret; 912 } 913 914 static inline int ocfs2_write_should_remove_suid(struct inode *inode) 915 { 916 mode_t mode = inode->i_mode; 917 918 if (!capable(CAP_FSETID)) { 919 if (unlikely(mode & S_ISUID)) 920 return 1; 921 922 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 923 return 1; 924 } 925 return 0; 926 } 927 928 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 929 const char __user *buf, 930 size_t count, 931 loff_t pos) 932 { 933 struct iovec local_iov = { .iov_base = (void __user *)buf, 934 .iov_len = count }; 935 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; 936 u32 clusters; 937 struct file *filp = iocb->ki_filp; 938 struct inode *inode = filp->f_dentry->d_inode; 939 loff_t newsize, saved_pos; 940 941 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 942 (unsigned int)count, 943 filp->f_dentry->d_name.len, 944 filp->f_dentry->d_name.name); 945 946 /* happy write of zero bytes */ 947 if (count == 0) 948 return 0; 949 950 if (!inode) { 951 mlog(0, "bad inode\n"); 952 return -EIO; 953 } 954 955 mutex_lock(&inode->i_mutex); 956 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 957 if (filp->f_flags & O_DIRECT) { 958 have_alloc_sem = 1; 959 down_read(&inode->i_alloc_sem); 960 } 961 962 /* concurrent O_DIRECT writes are allowed */ 963 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 964 ret = ocfs2_rw_lock(inode, rw_level); 965 if (ret < 0) { 966 rw_level = -1; 967 mlog_errno(ret); 968 goto out; 969 } 970 971 /* 972 * We sample i_size under a read level meta lock to see if our write 973 * is extending the file, if it is we back off and get a write level 974 * meta lock. 975 */ 976 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; 977 for(;;) { 978 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); 979 if (ret < 0) { 980 meta_level = -1; 981 mlog_errno(ret); 982 goto out; 983 } 984 985 /* Clear suid / sgid if necessary. We do this here 986 * instead of later in the write path because 987 * remove_suid() calls ->setattr without any hint that 988 * we may have already done our cluster locking. Since 989 * ocfs2_setattr() *must* take cluster locks to 990 * proceeed, this will lead us to recursively lock the 991 * inode. There's also the dinode i_size state which 992 * can be lost via setattr during extending writes (we 993 * set inode->i_size at the end of a write. */ 994 if (ocfs2_write_should_remove_suid(inode)) { 995 if (meta_level == 0) { 996 ocfs2_meta_unlock(inode, meta_level); 997 meta_level = 1; 998 continue; 999 } 1000 1001 ret = ocfs2_write_remove_suid(inode); 1002 if (ret < 0) { 1003 mlog_errno(ret); 1004 goto out; 1005 } 1006 } 1007 1008 /* work on a copy of ppos until we're sure that we won't have 1009 * to recalculate it due to relocking. */ 1010 if (filp->f_flags & O_APPEND) { 1011 saved_pos = i_size_read(inode); 1012 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1013 } else { 1014 saved_pos = iocb->ki_pos; 1015 } 1016 newsize = count + saved_pos; 1017 1018 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1019 (long long) saved_pos, (long long) newsize, 1020 (long long) i_size_read(inode)); 1021 1022 /* No need for a higher level metadata lock if we're 1023 * never going past i_size. */ 1024 if (newsize <= i_size_read(inode)) 1025 break; 1026 1027 if (meta_level == 0) { 1028 ocfs2_meta_unlock(inode, meta_level); 1029 meta_level = 1; 1030 continue; 1031 } 1032 1033 spin_lock(&OCFS2_I(inode)->ip_lock); 1034 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1035 OCFS2_I(inode)->ip_clusters; 1036 spin_unlock(&OCFS2_I(inode)->ip_lock); 1037 1038 mlog(0, "Writing at EOF, may need more allocation: " 1039 "i_size = %lld, newsize = %lld, need %u clusters\n", 1040 (long long) i_size_read(inode), (long long) newsize, 1041 clusters); 1042 1043 /* We only want to continue the rest of this loop if 1044 * our extend will actually require more 1045 * allocation. */ 1046 if (!clusters) 1047 break; 1048 1049 ret = ocfs2_extend_allocation(inode, clusters); 1050 if (ret < 0) { 1051 if (ret != -ENOSPC) 1052 mlog_errno(ret); 1053 goto out; 1054 } 1055 1056 /* Fill any holes which would've been created by this 1057 * write. If we're O_APPEND, this will wind up 1058 * (correctly) being a noop. */ 1059 ret = ocfs2_zero_extend(inode, (u64) newsize - count); 1060 if (ret < 0) { 1061 mlog_errno(ret); 1062 goto out; 1063 } 1064 break; 1065 } 1066 1067 /* ok, we're done with i_size and alloc work */ 1068 iocb->ki_pos = saved_pos; 1069 ocfs2_meta_unlock(inode, meta_level); 1070 meta_level = -1; 1071 1072 /* communicate with ocfs2_dio_end_io */ 1073 ocfs2_iocb_set_rw_locked(iocb); 1074 1075 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); 1076 1077 /* buffered aio wouldn't have proper lock coverage today */ 1078 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1079 1080 /* 1081 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1082 * function pointer which is called when o_direct io completes so that 1083 * it can unlock our rw lock. (it's the clustered equivalent of 1084 * i_alloc_sem; protects truncate from racing with pending ios). 1085 * Unfortunately there are error cases which call end_io and others 1086 * that don't. so we don't have to unlock the rw_lock if either an 1087 * async dio is going to do it in the future or an end_io after an 1088 * error has already done it. 1089 */ 1090 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1091 rw_level = -1; 1092 have_alloc_sem = 0; 1093 } 1094 1095 out: 1096 if (meta_level != -1) 1097 ocfs2_meta_unlock(inode, meta_level); 1098 if (have_alloc_sem) 1099 up_read(&inode->i_alloc_sem); 1100 if (rw_level != -1) 1101 ocfs2_rw_unlock(inode, rw_level); 1102 mutex_unlock(&inode->i_mutex); 1103 1104 mlog_exit(ret); 1105 return ret; 1106 } 1107 1108 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1109 char __user *buf, 1110 size_t count, 1111 loff_t pos) 1112 { 1113 int ret = 0, rw_level = -1, have_alloc_sem = 0; 1114 struct file *filp = iocb->ki_filp; 1115 struct inode *inode = filp->f_dentry->d_inode; 1116 1117 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 1118 (unsigned int)count, 1119 filp->f_dentry->d_name.len, 1120 filp->f_dentry->d_name.name); 1121 1122 if (!inode) { 1123 ret = -EINVAL; 1124 mlog_errno(ret); 1125 goto bail; 1126 } 1127 1128 /* 1129 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1130 * need locks to protect pending reads from racing with truncate. 1131 */ 1132 if (filp->f_flags & O_DIRECT) { 1133 down_read(&inode->i_alloc_sem); 1134 have_alloc_sem = 1; 1135 1136 ret = ocfs2_rw_lock(inode, 0); 1137 if (ret < 0) { 1138 mlog_errno(ret); 1139 goto bail; 1140 } 1141 rw_level = 0; 1142 /* communicate with ocfs2_dio_end_io */ 1143 ocfs2_iocb_set_rw_locked(iocb); 1144 } 1145 1146 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); 1147 if (ret == -EINVAL) 1148 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1149 1150 /* buffered aio wouldn't have proper lock coverage today */ 1151 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1152 1153 /* see ocfs2_file_aio_write */ 1154 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1155 rw_level = -1; 1156 have_alloc_sem = 0; 1157 } 1158 1159 bail: 1160 if (have_alloc_sem) 1161 up_read(&inode->i_alloc_sem); 1162 if (rw_level != -1) 1163 ocfs2_rw_unlock(inode, rw_level); 1164 mlog_exit(ret); 1165 1166 return ret; 1167 } 1168 1169 struct inode_operations ocfs2_file_iops = { 1170 .setattr = ocfs2_setattr, 1171 .getattr = ocfs2_getattr, 1172 }; 1173 1174 struct inode_operations ocfs2_special_file_iops = { 1175 .setattr = ocfs2_setattr, 1176 .getattr = ocfs2_getattr, 1177 }; 1178 1179 const struct file_operations ocfs2_fops = { 1180 .read = do_sync_read, 1181 .write = do_sync_write, 1182 .sendfile = generic_file_sendfile, 1183 .mmap = ocfs2_mmap, 1184 .fsync = ocfs2_sync_file, 1185 .release = ocfs2_file_release, 1186 .open = ocfs2_file_open, 1187 .aio_read = ocfs2_file_aio_read, 1188 .aio_write = ocfs2_file_aio_write, 1189 }; 1190 1191 const struct file_operations ocfs2_dops = { 1192 .read = generic_read_dir, 1193 .readdir = ocfs2_readdir, 1194 .fsync = ocfs2_sync_file, 1195 }; 1196