1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 34 #define MLOG_MASK_PREFIX ML_INODE 35 #include <cluster/masklog.h> 36 37 #include "ocfs2.h" 38 39 #include "alloc.h" 40 #include "aops.h" 41 #include "dir.h" 42 #include "dlmglue.h" 43 #include "extent_map.h" 44 #include "file.h" 45 #include "sysfile.h" 46 #include "inode.h" 47 #include "journal.h" 48 #include "mmap.h" 49 #include "suballoc.h" 50 #include "super.h" 51 52 #include "buffer_head_io.h" 53 54 static int ocfs2_sync_inode(struct inode *inode) 55 { 56 filemap_fdatawrite(inode->i_mapping); 57 return sync_mapping_buffers(inode->i_mapping); 58 } 59 60 static int ocfs2_file_open(struct inode *inode, struct file *file) 61 { 62 int status; 63 int mode = file->f_flags; 64 struct ocfs2_inode_info *oi = OCFS2_I(inode); 65 66 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 67 file->f_dentry->d_name.len, file->f_dentry->d_name.name); 68 69 spin_lock(&oi->ip_lock); 70 71 /* Check that the inode hasn't been wiped from disk by another 72 * node. If it hasn't then we're safe as long as we hold the 73 * spin lock until our increment of open count. */ 74 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 75 spin_unlock(&oi->ip_lock); 76 77 status = -ENOENT; 78 goto leave; 79 } 80 81 if (mode & O_DIRECT) 82 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 83 84 oi->ip_open_count++; 85 spin_unlock(&oi->ip_lock); 86 status = 0; 87 leave: 88 mlog_exit(status); 89 return status; 90 } 91 92 static int ocfs2_file_release(struct inode *inode, struct file *file) 93 { 94 struct ocfs2_inode_info *oi = OCFS2_I(inode); 95 96 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 97 file->f_dentry->d_name.len, 98 file->f_dentry->d_name.name); 99 100 spin_lock(&oi->ip_lock); 101 if (!--oi->ip_open_count) 102 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 103 spin_unlock(&oi->ip_lock); 104 105 mlog_exit(0); 106 107 return 0; 108 } 109 110 static int ocfs2_sync_file(struct file *file, 111 struct dentry *dentry, 112 int datasync) 113 { 114 int err = 0; 115 journal_t *journal; 116 struct inode *inode = dentry->d_inode; 117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 118 119 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 120 dentry->d_name.len, dentry->d_name.name); 121 122 err = ocfs2_sync_inode(dentry->d_inode); 123 if (err) 124 goto bail; 125 126 journal = osb->journal->j_journal; 127 err = journal_force_commit(journal); 128 129 bail: 130 mlog_exit(err); 131 132 return (err < 0) ? -EIO : 0; 133 } 134 135 int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, 136 struct inode *inode, 137 struct buffer_head *fe_bh, 138 u64 new_i_size) 139 { 140 int status; 141 142 mlog_entry_void(); 143 i_size_write(inode, new_i_size); 144 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 145 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 146 147 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 148 if (status < 0) { 149 mlog_errno(status); 150 goto bail; 151 } 152 153 bail: 154 mlog_exit(status); 155 return status; 156 } 157 158 static int ocfs2_simple_size_update(struct inode *inode, 159 struct buffer_head *di_bh, 160 u64 new_i_size) 161 { 162 int ret; 163 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 164 struct ocfs2_journal_handle *handle = NULL; 165 166 handle = ocfs2_start_trans(osb, NULL, 167 OCFS2_INODE_UPDATE_CREDITS); 168 if (handle == NULL) { 169 ret = -ENOMEM; 170 mlog_errno(ret); 171 goto out; 172 } 173 174 ret = ocfs2_set_inode_size(handle, inode, di_bh, 175 new_i_size); 176 if (ret < 0) 177 mlog_errno(ret); 178 179 ocfs2_commit_trans(handle); 180 out: 181 return ret; 182 } 183 184 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 185 struct inode *inode, 186 struct buffer_head *fe_bh, 187 u64 new_i_size) 188 { 189 int status; 190 struct ocfs2_journal_handle *handle; 191 192 mlog_entry_void(); 193 194 /* TODO: This needs to actually orphan the inode in this 195 * transaction. */ 196 197 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 198 if (IS_ERR(handle)) { 199 status = PTR_ERR(handle); 200 mlog_errno(status); 201 goto out; 202 } 203 204 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 205 if (status < 0) 206 mlog_errno(status); 207 208 ocfs2_commit_trans(handle); 209 out: 210 mlog_exit(status); 211 return status; 212 } 213 214 static int ocfs2_truncate_file(struct inode *inode, 215 struct buffer_head *di_bh, 216 u64 new_i_size) 217 { 218 int status = 0; 219 struct ocfs2_dinode *fe = NULL; 220 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 221 struct ocfs2_truncate_context *tc = NULL; 222 223 mlog_entry("(inode = %llu, new_i_size = %llu\n", 224 (unsigned long long)OCFS2_I(inode)->ip_blkno, 225 (unsigned long long)new_i_size); 226 227 truncate_inode_pages(inode->i_mapping, new_i_size); 228 229 fe = (struct ocfs2_dinode *) di_bh->b_data; 230 if (!OCFS2_IS_VALID_DINODE(fe)) { 231 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 232 status = -EIO; 233 goto bail; 234 } 235 236 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 237 "Inode %llu, inode i_size = %lld != di " 238 "i_size = %llu, i_flags = 0x%x\n", 239 (unsigned long long)OCFS2_I(inode)->ip_blkno, 240 i_size_read(inode), 241 (unsigned long long)le64_to_cpu(fe->i_size), 242 le32_to_cpu(fe->i_flags)); 243 244 if (new_i_size > le64_to_cpu(fe->i_size)) { 245 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 246 (unsigned long long)le64_to_cpu(fe->i_size), 247 (unsigned long long)new_i_size); 248 status = -EINVAL; 249 mlog_errno(status); 250 goto bail; 251 } 252 253 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 254 (unsigned long long)le64_to_cpu(fe->i_blkno), 255 (unsigned long long)le64_to_cpu(fe->i_size), 256 (unsigned long long)new_i_size); 257 258 /* lets handle the simple truncate cases before doing any more 259 * cluster locking. */ 260 if (new_i_size == le64_to_cpu(fe->i_size)) 261 goto bail; 262 263 /* This forces other nodes to sync and drop their pages. Do 264 * this even if we have a truncate without allocation change - 265 * ocfs2 cluster sizes can be much greater than page size, so 266 * we have to truncate them anyway. */ 267 status = ocfs2_data_lock(inode, 1); 268 if (status < 0) { 269 mlog_errno(status); 270 goto bail; 271 } 272 ocfs2_data_unlock(inode, 1); 273 274 if (le32_to_cpu(fe->i_clusters) == 275 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { 276 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", 277 fe->i_clusters); 278 /* No allocation change is required, so lets fast path 279 * this truncate. */ 280 status = ocfs2_simple_size_update(inode, di_bh, new_i_size); 281 if (status < 0) 282 mlog_errno(status); 283 goto bail; 284 } 285 286 /* alright, we're going to need to do a full blown alloc size 287 * change. Orphan the inode so that recovery can complete the 288 * truncate if necessary. This does the task of marking 289 * i_size. */ 290 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 291 if (status < 0) { 292 mlog_errno(status); 293 goto bail; 294 } 295 296 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 297 if (status < 0) { 298 mlog_errno(status); 299 goto bail; 300 } 301 302 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 303 if (status < 0) { 304 mlog_errno(status); 305 goto bail; 306 } 307 308 /* TODO: orphan dir cleanup here. */ 309 bail: 310 311 mlog_exit(status); 312 return status; 313 } 314 315 /* 316 * extend allocation only here. 317 * we'll update all the disk stuff, and oip->alloc_size 318 * 319 * expect stuff to be locked, a transaction started and enough data / 320 * metadata reservations in the contexts. 321 * 322 * Will return -EAGAIN, and a reason if a restart is needed. 323 * If passed in, *reason will always be set, even in error. 324 */ 325 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 326 struct inode *inode, 327 u32 clusters_to_add, 328 struct buffer_head *fe_bh, 329 struct ocfs2_journal_handle *handle, 330 struct ocfs2_alloc_context *data_ac, 331 struct ocfs2_alloc_context *meta_ac, 332 enum ocfs2_alloc_restarted *reason_ret) 333 { 334 int status = 0; 335 int free_extents; 336 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 337 enum ocfs2_alloc_restarted reason = RESTART_NONE; 338 u32 bit_off, num_bits; 339 u64 block; 340 341 BUG_ON(!clusters_to_add); 342 343 free_extents = ocfs2_num_free_extents(osb, inode, fe); 344 if (free_extents < 0) { 345 status = free_extents; 346 mlog_errno(status); 347 goto leave; 348 } 349 350 /* there are two cases which could cause us to EAGAIN in the 351 * we-need-more-metadata case: 352 * 1) we haven't reserved *any* 353 * 2) we are so fragmented, we've needed to add metadata too 354 * many times. */ 355 if (!free_extents && !meta_ac) { 356 mlog(0, "we haven't reserved any metadata!\n"); 357 status = -EAGAIN; 358 reason = RESTART_META; 359 goto leave; 360 } else if ((!free_extents) 361 && (ocfs2_alloc_context_bits_left(meta_ac) 362 < ocfs2_extend_meta_needed(fe))) { 363 mlog(0, "filesystem is really fragmented...\n"); 364 status = -EAGAIN; 365 reason = RESTART_META; 366 goto leave; 367 } 368 369 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 370 &bit_off, &num_bits); 371 if (status < 0) { 372 if (status != -ENOSPC) 373 mlog_errno(status); 374 goto leave; 375 } 376 377 BUG_ON(num_bits > clusters_to_add); 378 379 /* reserve our write early -- insert_extent may update the inode */ 380 status = ocfs2_journal_access(handle, inode, fe_bh, 381 OCFS2_JOURNAL_ACCESS_WRITE); 382 if (status < 0) { 383 mlog_errno(status); 384 goto leave; 385 } 386 387 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 388 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 389 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 390 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 391 num_bits, meta_ac); 392 if (status < 0) { 393 mlog_errno(status); 394 goto leave; 395 } 396 397 le32_add_cpu(&fe->i_clusters, num_bits); 398 spin_lock(&OCFS2_I(inode)->ip_lock); 399 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 400 spin_unlock(&OCFS2_I(inode)->ip_lock); 401 402 status = ocfs2_journal_dirty(handle, fe_bh); 403 if (status < 0) { 404 mlog_errno(status); 405 goto leave; 406 } 407 408 clusters_to_add -= num_bits; 409 410 if (clusters_to_add) { 411 mlog(0, "need to alloc once more, clusters = %u, wanted = " 412 "%u\n", fe->i_clusters, clusters_to_add); 413 status = -EAGAIN; 414 reason = RESTART_TRANS; 415 } 416 417 leave: 418 mlog_exit(status); 419 if (reason_ret) 420 *reason_ret = reason; 421 return status; 422 } 423 424 static int ocfs2_extend_allocation(struct inode *inode, 425 u32 clusters_to_add) 426 { 427 int status = 0; 428 int restart_func = 0; 429 int drop_alloc_sem = 0; 430 int credits, num_free_extents; 431 u32 prev_clusters; 432 struct buffer_head *bh = NULL; 433 struct ocfs2_dinode *fe = NULL; 434 struct ocfs2_journal_handle *handle = NULL; 435 struct ocfs2_alloc_context *data_ac = NULL; 436 struct ocfs2_alloc_context *meta_ac = NULL; 437 enum ocfs2_alloc_restarted why; 438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 439 440 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 441 442 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 443 OCFS2_BH_CACHED, inode); 444 if (status < 0) { 445 mlog_errno(status); 446 goto leave; 447 } 448 449 fe = (struct ocfs2_dinode *) bh->b_data; 450 if (!OCFS2_IS_VALID_DINODE(fe)) { 451 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 452 status = -EIO; 453 goto leave; 454 } 455 456 restart_all: 457 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 458 459 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " 460 "clusters_to_add = %u\n", 461 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 462 fe->i_clusters, clusters_to_add); 463 464 handle = ocfs2_alloc_handle(osb); 465 if (handle == NULL) { 466 status = -ENOMEM; 467 mlog_errno(status); 468 goto leave; 469 } 470 471 num_free_extents = ocfs2_num_free_extents(osb, 472 inode, 473 fe); 474 if (num_free_extents < 0) { 475 status = num_free_extents; 476 mlog_errno(status); 477 goto leave; 478 } 479 480 if (!num_free_extents) { 481 status = ocfs2_reserve_new_metadata(osb, 482 handle, 483 fe, 484 &meta_ac); 485 if (status < 0) { 486 if (status != -ENOSPC) 487 mlog_errno(status); 488 goto leave; 489 } 490 } 491 492 status = ocfs2_reserve_clusters(osb, 493 handle, 494 clusters_to_add, 495 &data_ac); 496 if (status < 0) { 497 if (status != -ENOSPC) 498 mlog_errno(status); 499 goto leave; 500 } 501 502 /* blocks peope in read/write from reading our allocation 503 * until we're done changing it. We depend on i_mutex to block 504 * other extend/truncate calls while we're here. Ordering wrt 505 * start_trans is important here -- always do it before! */ 506 down_write(&OCFS2_I(inode)->ip_alloc_sem); 507 drop_alloc_sem = 1; 508 509 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 510 handle = ocfs2_start_trans(osb, handle, credits); 511 if (IS_ERR(handle)) { 512 status = PTR_ERR(handle); 513 handle = NULL; 514 mlog_errno(status); 515 goto leave; 516 } 517 518 restarted_transaction: 519 /* reserve a write to the file entry early on - that we if we 520 * run out of credits in the allocation path, we can still 521 * update i_size. */ 522 status = ocfs2_journal_access(handle, inode, bh, 523 OCFS2_JOURNAL_ACCESS_WRITE); 524 if (status < 0) { 525 mlog_errno(status); 526 goto leave; 527 } 528 529 prev_clusters = OCFS2_I(inode)->ip_clusters; 530 531 status = ocfs2_do_extend_allocation(osb, 532 inode, 533 clusters_to_add, 534 bh, 535 handle, 536 data_ac, 537 meta_ac, 538 &why); 539 if ((status < 0) && (status != -EAGAIN)) { 540 if (status != -ENOSPC) 541 mlog_errno(status); 542 goto leave; 543 } 544 545 status = ocfs2_journal_dirty(handle, bh); 546 if (status < 0) { 547 mlog_errno(status); 548 goto leave; 549 } 550 551 spin_lock(&OCFS2_I(inode)->ip_lock); 552 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 553 spin_unlock(&OCFS2_I(inode)->ip_lock); 554 555 if (why != RESTART_NONE && clusters_to_add) { 556 if (why == RESTART_META) { 557 mlog(0, "restarting function.\n"); 558 restart_func = 1; 559 } else { 560 BUG_ON(why != RESTART_TRANS); 561 562 mlog(0, "restarting transaction.\n"); 563 /* TODO: This can be more intelligent. */ 564 credits = ocfs2_calc_extend_credits(osb->sb, 565 fe, 566 clusters_to_add); 567 status = ocfs2_extend_trans(handle, credits); 568 if (status < 0) { 569 /* handle still has to be committed at 570 * this point. */ 571 status = -ENOMEM; 572 mlog_errno(status); 573 goto leave; 574 } 575 goto restarted_transaction; 576 } 577 } 578 579 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 580 fe->i_clusters, (unsigned long long)fe->i_size); 581 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 582 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 583 584 leave: 585 if (drop_alloc_sem) { 586 up_write(&OCFS2_I(inode)->ip_alloc_sem); 587 drop_alloc_sem = 0; 588 } 589 if (handle) { 590 ocfs2_commit_trans(handle); 591 handle = NULL; 592 } 593 if (data_ac) { 594 ocfs2_free_alloc_context(data_ac); 595 data_ac = NULL; 596 } 597 if (meta_ac) { 598 ocfs2_free_alloc_context(meta_ac); 599 meta_ac = NULL; 600 } 601 if ((!status) && restart_func) { 602 restart_func = 0; 603 goto restart_all; 604 } 605 if (bh) { 606 brelse(bh); 607 bh = NULL; 608 } 609 610 mlog_exit(status); 611 return status; 612 } 613 614 /* Some parts of this taken from generic_cont_expand, which turned out 615 * to be too fragile to do exactly what we need without us having to 616 * worry about recursive locking in ->prepare_write() and 617 * ->commit_write(). */ 618 static int ocfs2_write_zero_page(struct inode *inode, 619 u64 size) 620 { 621 struct address_space *mapping = inode->i_mapping; 622 struct page *page; 623 unsigned long index; 624 unsigned int offset; 625 struct ocfs2_journal_handle *handle = NULL; 626 int ret; 627 628 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 629 /* ugh. in prepare/commit_write, if from==to==start of block, we 630 ** skip the prepare. make sure we never send an offset for the start 631 ** of a block 632 */ 633 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 634 offset++; 635 } 636 index = size >> PAGE_CACHE_SHIFT; 637 638 page = grab_cache_page(mapping, index); 639 if (!page) { 640 ret = -ENOMEM; 641 mlog_errno(ret); 642 goto out; 643 } 644 645 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 646 if (ret < 0) { 647 mlog_errno(ret); 648 goto out_unlock; 649 } 650 651 if (ocfs2_should_order_data(inode)) { 652 handle = ocfs2_start_walk_page_trans(inode, page, offset, 653 offset); 654 if (IS_ERR(handle)) { 655 ret = PTR_ERR(handle); 656 handle = NULL; 657 goto out_unlock; 658 } 659 } 660 661 /* must not update i_size! */ 662 ret = block_commit_write(page, offset, offset); 663 if (ret < 0) 664 mlog_errno(ret); 665 else 666 ret = 0; 667 668 if (handle) 669 ocfs2_commit_trans(handle); 670 out_unlock: 671 unlock_page(page); 672 page_cache_release(page); 673 out: 674 return ret; 675 } 676 677 static int ocfs2_zero_extend(struct inode *inode, 678 u64 zero_to_size) 679 { 680 int ret = 0; 681 u64 start_off; 682 struct super_block *sb = inode->i_sb; 683 684 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 685 while (start_off < zero_to_size) { 686 ret = ocfs2_write_zero_page(inode, start_off); 687 if (ret < 0) { 688 mlog_errno(ret); 689 goto out; 690 } 691 692 start_off += sb->s_blocksize; 693 } 694 695 out: 696 return ret; 697 } 698 699 /* 700 * A tail_to_skip value > 0 indicates that we're being called from 701 * ocfs2_file_aio_write(). This has the following implications: 702 * 703 * - we don't want to update i_size 704 * - di_bh will be NULL, which is fine because it's only used in the 705 * case where we want to update i_size. 706 * - ocfs2_zero_extend() will then only be filling the hole created 707 * between i_size and the start of the write. 708 */ 709 static int ocfs2_extend_file(struct inode *inode, 710 struct buffer_head *di_bh, 711 u64 new_i_size, 712 size_t tail_to_skip) 713 { 714 int ret = 0; 715 u32 clusters_to_add; 716 717 BUG_ON(!tail_to_skip && !di_bh); 718 719 /* setattr sometimes calls us like this. */ 720 if (new_i_size == 0) 721 goto out; 722 723 if (i_size_read(inode) == new_i_size) 724 goto out; 725 BUG_ON(new_i_size < i_size_read(inode)); 726 727 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 728 OCFS2_I(inode)->ip_clusters; 729 730 if (clusters_to_add) { 731 /* 732 * protect the pages that ocfs2_zero_extend is going to 733 * be pulling into the page cache.. we do this before the 734 * metadata extend so that we don't get into the situation 735 * where we've extended the metadata but can't get the data 736 * lock to zero. 737 */ 738 ret = ocfs2_data_lock(inode, 1); 739 if (ret < 0) { 740 mlog_errno(ret); 741 goto out; 742 } 743 744 ret = ocfs2_extend_allocation(inode, clusters_to_add); 745 if (ret < 0) { 746 mlog_errno(ret); 747 goto out_unlock; 748 } 749 750 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 751 if (ret < 0) { 752 mlog_errno(ret); 753 goto out_unlock; 754 } 755 } 756 757 if (!tail_to_skip) { 758 /* We're being called from ocfs2_setattr() which wants 759 * us to update i_size */ 760 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 761 if (ret < 0) 762 mlog_errno(ret); 763 } 764 765 out_unlock: 766 if (clusters_to_add) /* this is the only case in which we lock */ 767 ocfs2_data_unlock(inode, 1); 768 769 out: 770 return ret; 771 } 772 773 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 774 { 775 int status = 0, size_change; 776 struct inode *inode = dentry->d_inode; 777 struct super_block *sb = inode->i_sb; 778 struct ocfs2_super *osb = OCFS2_SB(sb); 779 struct buffer_head *bh = NULL; 780 struct ocfs2_journal_handle *handle = NULL; 781 782 mlog_entry("(0x%p, '%.*s')\n", dentry, 783 dentry->d_name.len, dentry->d_name.name); 784 785 if (attr->ia_valid & ATTR_MODE) 786 mlog(0, "mode change: %d\n", attr->ia_mode); 787 if (attr->ia_valid & ATTR_UID) 788 mlog(0, "uid change: %d\n", attr->ia_uid); 789 if (attr->ia_valid & ATTR_GID) 790 mlog(0, "gid change: %d\n", attr->ia_gid); 791 if (attr->ia_valid & ATTR_SIZE) 792 mlog(0, "size change...\n"); 793 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 794 mlog(0, "time change...\n"); 795 796 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 797 | ATTR_GID | ATTR_UID | ATTR_MODE) 798 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 799 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 800 return 0; 801 } 802 803 status = inode_change_ok(inode, attr); 804 if (status) 805 return status; 806 807 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 808 if (size_change) { 809 status = ocfs2_rw_lock(inode, 1); 810 if (status < 0) { 811 mlog_errno(status); 812 goto bail; 813 } 814 } 815 816 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 817 if (status < 0) { 818 if (status != -ENOENT) 819 mlog_errno(status); 820 goto bail_unlock_rw; 821 } 822 823 if (size_change && attr->ia_size != i_size_read(inode)) { 824 if (i_size_read(inode) > attr->ia_size) 825 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 826 else 827 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 828 if (status < 0) { 829 if (status != -ENOSPC) 830 mlog_errno(status); 831 status = -ENOSPC; 832 goto bail_unlock; 833 } 834 } 835 836 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 837 if (IS_ERR(handle)) { 838 status = PTR_ERR(handle); 839 mlog_errno(status); 840 goto bail_unlock; 841 } 842 843 status = inode_setattr(inode, attr); 844 if (status < 0) { 845 mlog_errno(status); 846 goto bail_commit; 847 } 848 849 status = ocfs2_mark_inode_dirty(handle, inode, bh); 850 if (status < 0) 851 mlog_errno(status); 852 853 bail_commit: 854 ocfs2_commit_trans(handle); 855 bail_unlock: 856 ocfs2_meta_unlock(inode, 1); 857 bail_unlock_rw: 858 if (size_change) 859 ocfs2_rw_unlock(inode, 1); 860 bail: 861 if (bh) 862 brelse(bh); 863 864 mlog_exit(status); 865 return status; 866 } 867 868 int ocfs2_getattr(struct vfsmount *mnt, 869 struct dentry *dentry, 870 struct kstat *stat) 871 { 872 struct inode *inode = dentry->d_inode; 873 struct super_block *sb = dentry->d_inode->i_sb; 874 struct ocfs2_super *osb = sb->s_fs_info; 875 int err; 876 877 mlog_entry_void(); 878 879 err = ocfs2_inode_revalidate(dentry); 880 if (err) { 881 if (err != -ENOENT) 882 mlog_errno(err); 883 goto bail; 884 } 885 886 generic_fillattr(inode, stat); 887 888 /* We set the blksize from the cluster size for performance */ 889 stat->blksize = osb->s_clustersize; 890 891 bail: 892 mlog_exit(err); 893 894 return err; 895 } 896 897 static int ocfs2_write_remove_suid(struct inode *inode) 898 { 899 int ret; 900 struct buffer_head *bh = NULL; 901 struct ocfs2_inode_info *oi = OCFS2_I(inode); 902 struct ocfs2_journal_handle *handle; 903 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 904 struct ocfs2_dinode *di; 905 906 mlog_entry("(Inode %llu, mode 0%o)\n", 907 (unsigned long long)oi->ip_blkno, inode->i_mode); 908 909 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 910 if (handle == NULL) { 911 ret = -ENOMEM; 912 mlog_errno(ret); 913 goto out; 914 } 915 916 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 917 if (ret < 0) { 918 mlog_errno(ret); 919 goto out_trans; 920 } 921 922 ret = ocfs2_journal_access(handle, inode, bh, 923 OCFS2_JOURNAL_ACCESS_WRITE); 924 if (ret < 0) { 925 mlog_errno(ret); 926 goto out_bh; 927 } 928 929 inode->i_mode &= ~S_ISUID; 930 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 931 inode->i_mode &= ~S_ISGID; 932 933 di = (struct ocfs2_dinode *) bh->b_data; 934 di->i_mode = cpu_to_le16(inode->i_mode); 935 936 ret = ocfs2_journal_dirty(handle, bh); 937 if (ret < 0) 938 mlog_errno(ret); 939 out_bh: 940 brelse(bh); 941 out_trans: 942 ocfs2_commit_trans(handle); 943 out: 944 mlog_exit(ret); 945 return ret; 946 } 947 948 static inline int ocfs2_write_should_remove_suid(struct inode *inode) 949 { 950 mode_t mode = inode->i_mode; 951 952 if (!capable(CAP_FSETID)) { 953 if (unlikely(mode & S_ISUID)) 954 return 1; 955 956 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 957 return 1; 958 } 959 return 0; 960 } 961 962 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 963 const char __user *buf, 964 size_t count, 965 loff_t pos) 966 { 967 struct iovec local_iov = { .iov_base = (void __user *)buf, 968 .iov_len = count }; 969 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; 970 u32 clusters; 971 struct file *filp = iocb->ki_filp; 972 struct inode *inode = filp->f_dentry->d_inode; 973 loff_t newsize, saved_pos; 974 975 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 976 (unsigned int)count, 977 filp->f_dentry->d_name.len, 978 filp->f_dentry->d_name.name); 979 980 /* happy write of zero bytes */ 981 if (count == 0) 982 return 0; 983 984 if (!inode) { 985 mlog(0, "bad inode\n"); 986 return -EIO; 987 } 988 989 mutex_lock(&inode->i_mutex); 990 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 991 if (filp->f_flags & O_DIRECT) { 992 have_alloc_sem = 1; 993 down_read(&inode->i_alloc_sem); 994 } 995 996 /* concurrent O_DIRECT writes are allowed */ 997 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 998 ret = ocfs2_rw_lock(inode, rw_level); 999 if (ret < 0) { 1000 rw_level = -1; 1001 mlog_errno(ret); 1002 goto out; 1003 } 1004 1005 /* 1006 * We sample i_size under a read level meta lock to see if our write 1007 * is extending the file, if it is we back off and get a write level 1008 * meta lock. 1009 */ 1010 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; 1011 for(;;) { 1012 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); 1013 if (ret < 0) { 1014 meta_level = -1; 1015 mlog_errno(ret); 1016 goto out; 1017 } 1018 1019 /* Clear suid / sgid if necessary. We do this here 1020 * instead of later in the write path because 1021 * remove_suid() calls ->setattr without any hint that 1022 * we may have already done our cluster locking. Since 1023 * ocfs2_setattr() *must* take cluster locks to 1024 * proceeed, this will lead us to recursively lock the 1025 * inode. There's also the dinode i_size state which 1026 * can be lost via setattr during extending writes (we 1027 * set inode->i_size at the end of a write. */ 1028 if (ocfs2_write_should_remove_suid(inode)) { 1029 if (meta_level == 0) { 1030 ocfs2_meta_unlock(inode, meta_level); 1031 meta_level = 1; 1032 continue; 1033 } 1034 1035 ret = ocfs2_write_remove_suid(inode); 1036 if (ret < 0) { 1037 mlog_errno(ret); 1038 goto out; 1039 } 1040 } 1041 1042 /* work on a copy of ppos until we're sure that we won't have 1043 * to recalculate it due to relocking. */ 1044 if (filp->f_flags & O_APPEND) { 1045 saved_pos = i_size_read(inode); 1046 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1047 } else { 1048 saved_pos = iocb->ki_pos; 1049 } 1050 newsize = count + saved_pos; 1051 1052 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1053 (long long) saved_pos, (long long) newsize, 1054 (long long) i_size_read(inode)); 1055 1056 /* No need for a higher level metadata lock if we're 1057 * never going past i_size. */ 1058 if (newsize <= i_size_read(inode)) 1059 break; 1060 1061 if (meta_level == 0) { 1062 ocfs2_meta_unlock(inode, meta_level); 1063 meta_level = 1; 1064 continue; 1065 } 1066 1067 spin_lock(&OCFS2_I(inode)->ip_lock); 1068 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1069 OCFS2_I(inode)->ip_clusters; 1070 spin_unlock(&OCFS2_I(inode)->ip_lock); 1071 1072 mlog(0, "Writing at EOF, may need more allocation: " 1073 "i_size = %lld, newsize = %lld, need %u clusters\n", 1074 (long long) i_size_read(inode), (long long) newsize, 1075 clusters); 1076 1077 /* We only want to continue the rest of this loop if 1078 * our extend will actually require more 1079 * allocation. */ 1080 if (!clusters) 1081 break; 1082 1083 ret = ocfs2_extend_file(inode, NULL, newsize, count); 1084 if (ret < 0) { 1085 if (ret != -ENOSPC) 1086 mlog_errno(ret); 1087 goto out; 1088 } 1089 break; 1090 } 1091 1092 /* ok, we're done with i_size and alloc work */ 1093 iocb->ki_pos = saved_pos; 1094 ocfs2_meta_unlock(inode, meta_level); 1095 meta_level = -1; 1096 1097 /* communicate with ocfs2_dio_end_io */ 1098 ocfs2_iocb_set_rw_locked(iocb); 1099 1100 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); 1101 1102 /* buffered aio wouldn't have proper lock coverage today */ 1103 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1104 1105 /* 1106 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1107 * function pointer which is called when o_direct io completes so that 1108 * it can unlock our rw lock. (it's the clustered equivalent of 1109 * i_alloc_sem; protects truncate from racing with pending ios). 1110 * Unfortunately there are error cases which call end_io and others 1111 * that don't. so we don't have to unlock the rw_lock if either an 1112 * async dio is going to do it in the future or an end_io after an 1113 * error has already done it. 1114 */ 1115 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1116 rw_level = -1; 1117 have_alloc_sem = 0; 1118 } 1119 1120 out: 1121 if (meta_level != -1) 1122 ocfs2_meta_unlock(inode, meta_level); 1123 if (have_alloc_sem) 1124 up_read(&inode->i_alloc_sem); 1125 if (rw_level != -1) 1126 ocfs2_rw_unlock(inode, rw_level); 1127 mutex_unlock(&inode->i_mutex); 1128 1129 mlog_exit(ret); 1130 return ret; 1131 } 1132 1133 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1134 char __user *buf, 1135 size_t count, 1136 loff_t pos) 1137 { 1138 int ret = 0, rw_level = -1, have_alloc_sem = 0; 1139 struct file *filp = iocb->ki_filp; 1140 struct inode *inode = filp->f_dentry->d_inode; 1141 1142 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 1143 (unsigned int)count, 1144 filp->f_dentry->d_name.len, 1145 filp->f_dentry->d_name.name); 1146 1147 if (!inode) { 1148 ret = -EINVAL; 1149 mlog_errno(ret); 1150 goto bail; 1151 } 1152 1153 /* 1154 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1155 * need locks to protect pending reads from racing with truncate. 1156 */ 1157 if (filp->f_flags & O_DIRECT) { 1158 down_read(&inode->i_alloc_sem); 1159 have_alloc_sem = 1; 1160 1161 ret = ocfs2_rw_lock(inode, 0); 1162 if (ret < 0) { 1163 mlog_errno(ret); 1164 goto bail; 1165 } 1166 rw_level = 0; 1167 /* communicate with ocfs2_dio_end_io */ 1168 ocfs2_iocb_set_rw_locked(iocb); 1169 } 1170 1171 /* 1172 * We're fine letting folks race truncates and extending 1173 * writes with read across the cluster, just like they can 1174 * locally. Hence no rw_lock during read. 1175 * 1176 * Take and drop the meta data lock to update inode fields 1177 * like i_size. This allows the checks down below 1178 * generic_file_aio_read() a chance of actually working. 1179 */ 1180 ret = ocfs2_meta_lock(inode, NULL, NULL, 0); 1181 if (ret < 0) { 1182 mlog_errno(ret); 1183 goto bail; 1184 } 1185 ocfs2_meta_unlock(inode, 0); 1186 1187 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); 1188 if (ret == -EINVAL) 1189 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1190 1191 /* buffered aio wouldn't have proper lock coverage today */ 1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1193 1194 /* see ocfs2_file_aio_write */ 1195 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1196 rw_level = -1; 1197 have_alloc_sem = 0; 1198 } 1199 1200 bail: 1201 if (have_alloc_sem) 1202 up_read(&inode->i_alloc_sem); 1203 if (rw_level != -1) 1204 ocfs2_rw_unlock(inode, rw_level); 1205 mlog_exit(ret); 1206 1207 return ret; 1208 } 1209 1210 struct inode_operations ocfs2_file_iops = { 1211 .setattr = ocfs2_setattr, 1212 .getattr = ocfs2_getattr, 1213 }; 1214 1215 struct inode_operations ocfs2_special_file_iops = { 1216 .setattr = ocfs2_setattr, 1217 .getattr = ocfs2_getattr, 1218 }; 1219 1220 const struct file_operations ocfs2_fops = { 1221 .read = do_sync_read, 1222 .write = do_sync_write, 1223 .sendfile = generic_file_sendfile, 1224 .mmap = ocfs2_mmap, 1225 .fsync = ocfs2_sync_file, 1226 .release = ocfs2_file_release, 1227 .open = ocfs2_file_open, 1228 .aio_read = ocfs2_file_aio_read, 1229 .aio_write = ocfs2_file_aio_write, 1230 }; 1231 1232 const struct file_operations ocfs2_dops = { 1233 .read = generic_read_dir, 1234 .readdir = ocfs2_readdir, 1235 .fsync = ocfs2_sync_file, 1236 }; 1237