/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"

#include "buffer_head_io.h"

static int ocfs2_sync_inode(struct inode *inode)
{
        filemap_fdatawrite(inode->i_mapping);
        return sync_mapping_buffers(inode->i_mapping);
}
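/*
 * Open and release track an open count under ip_lock: the
 * OCFS2_INODE_OPEN_DIRECT flag is set while any O_DIRECT opener is
 * active and cleared only when the last opener goes away. Open also
 * refuses inodes which another node has already wiped from disk.
 */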
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
        int status;
        int mode = file->f_flags;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);

        mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
                   file->f_dentry->d_name.len, file->f_dentry->d_name.name);

        spin_lock(&oi->ip_lock);

        /* Check that the inode hasn't been wiped from disk by another
         * node. If it hasn't then we're safe as long as we hold the
         * spin lock until our increment of open count. */
        if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
                spin_unlock(&oi->ip_lock);

                status = -ENOENT;
                goto leave;
        }

        if (mode & O_DIRECT)
                oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

        oi->ip_open_count++;
        spin_unlock(&oi->ip_lock);
        status = 0;
leave:
        mlog_exit(status);
        return status;
}

static int ocfs2_file_release(struct inode *inode, struct file *file)
{
        struct ocfs2_inode_info *oi = OCFS2_I(inode);

        mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
                   file->f_dentry->d_name.len,
                   file->f_dentry->d_name.name);

        spin_lock(&oi->ip_lock);
        if (!--oi->ip_open_count)
                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
        spin_unlock(&oi->ip_lock);

        mlog_exit(0);

        return 0;
}

static int ocfs2_sync_file(struct file *file,
                           struct dentry *dentry,
                           int datasync)
{
        int err = 0;
        journal_t *journal;
        struct inode *inode = dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
                   dentry->d_name.len, dentry->d_name.name);

        err = ocfs2_sync_inode(dentry->d_inode);
        if (err)
                goto bail;

        journal = osb->journal->j_journal;
        err = journal_force_commit(journal);

bail:
        mlog_exit(err);

        return (err < 0) ? -EIO : 0;
}

int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
                         struct inode *inode,
                         struct buffer_head *fe_bh,
                         u64 new_i_size)
{
        int status;

        mlog_entry_void();
        i_size_write(inode, new_i_size);
        inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;

        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

bail:
        mlog_exit(status);
        return status;
}

static int ocfs2_simple_size_update(struct inode *inode,
                                    struct buffer_head *di_bh,
                                    u64 new_i_size)
{
        int ret;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_journal_handle *handle = NULL;

        handle = ocfs2_start_trans(osb, NULL,
                                   OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_set_inode_size(handle, inode, di_bh,
                                   new_i_size);
        if (ret < 0)
                mlog_errno(ret);

        ocfs2_commit_trans(handle);
out:
        return ret;
}
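/*
 * ocfs2_simple_size_update() above is the shared i_size fast path:
 * both ocfs2_truncate_file() (when no allocation change is needed) and
 * ocfs2_extend_file() (on behalf of ocfs2_setattr()) use it to update
 * i_size in a single small transaction.
 */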
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                                     struct inode *inode,
                                     struct buffer_head *fe_bh,
                                     u64 new_i_size)
{
        int status;
        struct ocfs2_journal_handle *handle;

        mlog_entry_void();

        /* TODO: This needs to actually orphan the inode in this
         * transaction. */

        handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
                goto out;
        }

        status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
        if (status < 0)
                mlog_errno(status);

        ocfs2_commit_trans(handle);
out:
        mlog_exit(status);
        return status;
}

static int ocfs2_truncate_file(struct inode *inode,
                               struct buffer_head *di_bh,
                               u64 new_i_size)
{
        int status = 0;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_truncate_context *tc = NULL;

        mlog_entry("(inode = %llu, new_i_size = %llu)\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
                   (unsigned long long)new_i_size);

        truncate_inode_pages(inode->i_mapping, new_i_size);

        fe = (struct ocfs2_dinode *) di_bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
                status = -EIO;
                goto bail;
        }

        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
                        "Inode %llu, inode i_size = %lld != di "
                        "i_size = %llu, i_flags = 0x%x\n",
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
                        i_size_read(inode),
                        (unsigned long long)le64_to_cpu(fe->i_size),
                        le32_to_cpu(fe->i_flags));

        if (new_i_size > le64_to_cpu(fe->i_size)) {
                mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
                     (unsigned long long)le64_to_cpu(fe->i_size),
                     (unsigned long long)new_i_size);
                status = -EINVAL;
                mlog_errno(status);
                goto bail;
        }

        mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
             (unsigned long long)le64_to_cpu(fe->i_blkno),
             (unsigned long long)le64_to_cpu(fe->i_size),
             (unsigned long long)new_i_size);

        /* Let's handle the simple truncate cases before doing any more
         * cluster locking. */
        if (new_i_size == le64_to_cpu(fe->i_size))
                goto bail;

        /* This forces other nodes to sync and drop their pages. Do
         * this even if we have a truncate without allocation change --
         * ocfs2 cluster sizes can be much greater than page size, so
         * we have to truncate them anyway. */
        status = ocfs2_data_lock(inode, 1);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        ocfs2_data_unlock(inode, 1);

        if (le32_to_cpu(fe->i_clusters) ==
            ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
                mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
                     le32_to_cpu(fe->i_clusters));
                /* No allocation change is required, so let's fast-path
                 * this truncate. */
                status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
                if (status < 0)
                        mlog_errno(status);
                goto bail;
        }

        /* Alright, we're going to need to do a full-blown alloc size
         * change. Orphan the inode so that recovery can complete the
         * truncate if necessary. This does the task of marking
         * i_size. */
        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        /* TODO: orphan dir cleanup here. */
bail:

        mlog_exit(status);
        return status;
}
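/*
 * Truncate path summary: ocfs2_truncate_file() above drops the page
 * cache, fast-paths size-only changes through
 * ocfs2_simple_size_update(), and otherwise orphans the inode before
 * handing the extent work to ocfs2_prepare_truncate() and
 * ocfs2_commit_truncate().
 */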
/*
 * Extend allocation only here.
 * We'll update all the disk stuff, and the inode's allocation size.
 *
 * Expect everything to be locked, a transaction started, and enough
 * data / metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason_ret will always be set, even on error.
 */
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
                               struct inode *inode,
                               u32 clusters_to_add,
                               struct buffer_head *fe_bh,
                               struct ocfs2_journal_handle *handle,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               enum ocfs2_alloc_restarted *reason_ret)
{
        int status = 0;
        int free_extents;
        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
        enum ocfs2_alloc_restarted reason = RESTART_NONE;
        u32 bit_off, num_bits;
        u64 block;

        BUG_ON(!clusters_to_add);

        free_extents = ocfs2_num_free_extents(osb, inode, fe);
        if (free_extents < 0) {
                status = free_extents;
                mlog_errno(status);
                goto leave;
        }

        /* There are two cases which can cause us to EAGAIN in the
         * we-need-more-metadata case:
         * 1) we haven't reserved *any*
         * 2) we are so fragmented, we've needed to add metadata too
         *    many times. */
        if (!free_extents && !meta_ac) {
                mlog(0, "we haven't reserved any metadata!\n");
                status = -EAGAIN;
                reason = RESTART_META;
                goto leave;
        } else if ((!free_extents)
                   && (ocfs2_alloc_context_bits_left(meta_ac)
                       < ocfs2_extend_meta_needed(fe))) {
                mlog(0, "filesystem is really fragmented...\n");
                status = -EAGAIN;
                reason = RESTART_META;
                goto leave;
        }

        status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
                                      &bit_off, &num_bits);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
                goto leave;
        }

        BUG_ON(num_bits > clusters_to_add);

        /* Reserve our write early -- insert_extent may update the inode */
        status = ocfs2_journal_access(handle, inode, fe_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }

        block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
        mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
             num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
        status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
                                     num_bits, meta_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }

        le32_add_cpu(&fe->i_clusters, num_bits);
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
        spin_unlock(&OCFS2_I(inode)->ip_lock);

        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }

        clusters_to_add -= num_bits;

        if (clusters_to_add) {
                mlog(0, "need to alloc once more, clusters = %u, wanted = "
                     "%u\n", le32_to_cpu(fe->i_clusters), clusters_to_add);
                status = -EAGAIN;
                reason = RESTART_TRANS;
        }

leave:
        mlog_exit(status);
        if (reason_ret)
                *reason_ret = reason;
        return status;
}
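/*
 * ocfs2_extend_allocation() below is the canonical caller of
 * ocfs2_do_extend_allocation(): it reserves data and metadata, starts
 * a transaction, and loops on -EAGAIN -- extending the running
 * transaction for RESTART_TRANS, or committing and starting over from
 * restart_all for RESTART_META.
 */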
static int ocfs2_extend_allocation(struct inode *inode,
                                   u32 clusters_to_add)
{
        int status = 0;
        int restart_func = 0;
        int drop_alloc_sem = 0;
        int credits, num_free_extents;
        u32 prev_clusters;
        struct buffer_head *bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_journal_handle *handle = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
        struct ocfs2_alloc_context *meta_ac = NULL;
        enum ocfs2_alloc_restarted why;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

        mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

        status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
                                  OCFS2_BH_CACHED, inode);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }

        fe = (struct ocfs2_dinode *) bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
                status = -EIO;
                goto leave;
        }

restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

        mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
             "clusters_to_add = %u\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
             le32_to_cpu(fe->i_clusters), clusters_to_add);

        handle = ocfs2_alloc_handle(osb);
        if (handle == NULL) {
                status = -ENOMEM;
                mlog_errno(status);
                goto leave;
        }

        num_free_extents = ocfs2_num_free_extents(osb,
                                                  inode,
                                                  fe);
        if (num_free_extents < 0) {
                status = num_free_extents;
                mlog_errno(status);
                goto leave;
        }

        if (!num_free_extents) {
                status = ocfs2_reserve_new_metadata(osb,
                                                    handle,
                                                    fe,
                                                    &meta_ac);
                if (status < 0) {
                        if (status != -ENOSPC)
                                mlog_errno(status);
                        goto leave;
                }
        }

        status = ocfs2_reserve_clusters(osb,
                                        handle,
                                        clusters_to_add,
                                        &data_ac);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
                goto leave;
        }

        /* Block people in read/write from reading our allocation
         * until we're done changing it. We depend on i_mutex to block
         * other extend/truncate calls while we're here. Ordering wrt
         * start_trans is important here -- always do it before! */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
        drop_alloc_sem = 1;

        credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
        handle = ocfs2_start_trans(osb, handle, credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
                mlog_errno(status);
                goto leave;
        }

restarted_transaction:
        /* Reserve a write to the file entry early on -- so that if we
         * run out of credits in the allocation path, we can still
         * update i_size. */
        status = ocfs2_journal_access(handle, inode, bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }

        prev_clusters = OCFS2_I(inode)->ip_clusters;

        status = ocfs2_do_extend_allocation(osb,
                                            inode,
                                            clusters_to_add,
                                            bh,
                                            handle,
                                            data_ac,
                                            meta_ac,
                                            &why);
        if ((status < 0) && (status != -EAGAIN)) {
                if (status != -ENOSPC)
                        mlog_errno(status);
                goto leave;
        }

        status = ocfs2_journal_dirty(handle, bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }

        spin_lock(&OCFS2_I(inode)->ip_lock);
        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
        spin_unlock(&OCFS2_I(inode)->ip_lock);

        if (why != RESTART_NONE && clusters_to_add) {
                if (why == RESTART_META) {
                        mlog(0, "restarting function.\n");
                        restart_func = 1;
                } else {
                        BUG_ON(why != RESTART_TRANS);

                        mlog(0, "restarting transaction.\n");
                        /* TODO: This can be more intelligent. */
                        credits = ocfs2_calc_extend_credits(osb->sb,
                                                            fe,
                                                            clusters_to_add);
                        status = ocfs2_extend_trans(handle, credits);
                        if (status < 0) {
                                /* handle still has to be committed at
                                 * this point. */
                                status = -ENOMEM;
                                mlog_errno(status);
                                goto leave;
                        }
                        goto restarted_transaction;
                }
        }

        mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
             le32_to_cpu(fe->i_clusters),
             (unsigned long long)le64_to_cpu(fe->i_size));
        mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
             OCFS2_I(inode)->ip_clusters, i_size_read(inode));

leave:
        if (drop_alloc_sem) {
                up_write(&OCFS2_I(inode)->ip_alloc_sem);
                drop_alloc_sem = 0;
        }
        if (handle) {
                ocfs2_commit_trans(handle);
                handle = NULL;
        }
        if (data_ac) {
                ocfs2_free_alloc_context(data_ac);
                data_ac = NULL;
        }
        if (meta_ac) {
                ocfs2_free_alloc_context(meta_ac);
                meta_ac = NULL;
        }
        if ((!status) && restart_func) {
                restart_func = 0;
                goto restart_all;
        }
        if (bh) {
                brelse(bh);
                bh = NULL;
        }

        mlog_exit(status);
        return status;
}
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->prepare_write() and
 * ->commit_write(). */
static int ocfs2_write_zero_page(struct inode *inode,
                                 u64 size)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        unsigned long index;
        unsigned int offset;
        struct ocfs2_journal_handle *handle = NULL;
        int ret;

        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
        /* Ugh. In prepare/commit_write, if from==to==start of block,
         * we skip the prepare. Make sure we never send an offset for
         * the start of a block. */
        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
                offset++;
        }
        index = size >> PAGE_CACHE_SHIFT;

        page = grab_cache_page(mapping, index);
        if (!page) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_unlock;
        }

        if (ocfs2_should_order_data(inode)) {
                handle = ocfs2_start_walk_page_trans(inode, page, offset,
                                                     offset);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        handle = NULL;
                        goto out_unlock;
                }
        }

        /* must not update i_size! */
        ret = block_commit_write(page, offset, offset);
        if (ret < 0)
                mlog_errno(ret);
        else
                ret = 0;

        if (handle)
                ocfs2_commit_trans(handle);
out_unlock:
        unlock_page(page);
        page_cache_release(page);
out:
        return ret;
}

static int ocfs2_zero_extend(struct inode *inode,
                             u64 zero_to_size)
{
        int ret = 0;
        u64 start_off;
        struct super_block *sb = inode->i_sb;

        start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
        while (start_off < zero_to_size) {
                ret = ocfs2_write_zero_page(inode, start_off);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }

                start_off += sb->s_blocksize;
        }

out:
        return ret;
}
/*
 * A tail_to_skip value > 0 indicates that we're being called from
 * ocfs2_file_aio_write(). This has the following implications:
 *
 * - we don't want to update i_size
 * - di_bh will be NULL, which is fine because it's only used in the
 *   case where we want to update i_size.
 * - ocfs2_zero_extend() will then only be filling the hole created
 *   between i_size and the start of the write.
 */
static int ocfs2_extend_file(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size,
                             size_t tail_to_skip)
{
        int ret = 0;
        u32 clusters_to_add;

        BUG_ON(!tail_to_skip && !di_bh);

        /* setattr sometimes calls us like this. */
        if (new_i_size == 0)
                goto out;

        if (i_size_read(inode) == new_i_size)
                goto out;
        BUG_ON(new_i_size < i_size_read(inode));

        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
                OCFS2_I(inode)->ip_clusters;

        if (clusters_to_add) {
                /*
                 * Protect the pages that ocfs2_zero_extend() is going
                 * to be pulling into the page cache. We do this before
                 * the metadata extend so that we don't get into the
                 * situation where we've extended the metadata but
                 * can't get the data lock to zero.
                 */
                ret = ocfs2_data_lock(inode, 1);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }

                ret = ocfs2_extend_allocation(inode, clusters_to_add);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock;
                }

                ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out_unlock;
                }
        }

        if (!tail_to_skip) {
                /* We're being called from ocfs2_setattr() which wants
                 * us to update i_size */
                ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
                if (ret < 0)
                        mlog_errno(ret);
        }

out_unlock:
        if (clusters_to_add) /* this is the only case in which we lock */
                ocfs2_data_unlock(inode, 1);

out:
        return ret;
}
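/*
 * ocfs2_extend_file() has exactly two callers: ocfs2_setattr() below,
 * which passes tail_to_skip == 0 and a pinned di_bh so that i_size is
 * updated, and ocfs2_file_aio_write(), which passes the write count as
 * tail_to_skip and leaves i_size to the write itself.
 */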
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
        int status = 0, size_change;
        struct inode *inode = dentry->d_inode;
        struct super_block *sb = inode->i_sb;
        struct ocfs2_super *osb = OCFS2_SB(sb);
        struct buffer_head *bh = NULL;
        struct ocfs2_journal_handle *handle = NULL;

        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);

        if (attr->ia_valid & ATTR_MODE)
                mlog(0, "mode change: %d\n", attr->ia_mode);
        if (attr->ia_valid & ATTR_UID)
                mlog(0, "uid change: %d\n", attr->ia_uid);
        if (attr->ia_valid & ATTR_GID)
                mlog(0, "gid change: %d\n", attr->ia_gid);
        if (attr->ia_valid & ATTR_SIZE)
                mlog(0, "size change...\n");
        if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
                mlog(0, "time change...\n");

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
                           | ATTR_GID | ATTR_UID | ATTR_MODE)
        if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
                mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
                return 0;
        }

        status = inode_change_ok(inode, attr);
        if (status)
                return status;

        size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
        if (size_change) {
                status = ocfs2_rw_lock(inode, 1);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        status = ocfs2_meta_lock(inode, NULL, &bh, 1);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
                goto bail_unlock_rw;
        }

        if (size_change && attr->ia_size != i_size_read(inode)) {
                if (i_size_read(inode) > attr->ia_size)
                        status = ocfs2_truncate_file(inode, bh, attr->ia_size);
                else
                        status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
                if (status < 0) {
                        if (status != -ENOSPC)
                                mlog_errno(status);
                        status = -ENOSPC;
                        goto bail_unlock;
                }
        }

        handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
                goto bail_unlock;
        }

        status = inode_setattr(inode, attr);
        if (status < 0) {
                mlog_errno(status);
                goto bail_commit;
        }

        status = ocfs2_mark_inode_dirty(handle, inode, bh);
        if (status < 0)
                mlog_errno(status);

bail_commit:
        ocfs2_commit_trans(handle);
bail_unlock:
        ocfs2_meta_unlock(inode, 1);
bail_unlock_rw:
        if (size_change)
                ocfs2_rw_unlock(inode, 1);
bail:
        if (bh)
                brelse(bh);

        mlog_exit(status);
        return status;
}

int ocfs2_getattr(struct vfsmount *mnt,
                  struct dentry *dentry,
                  struct kstat *stat)
{
        struct inode *inode = dentry->d_inode;
        struct super_block *sb = dentry->d_inode->i_sb;
        struct ocfs2_super *osb = sb->s_fs_info;
        int err;

        mlog_entry_void();

        err = ocfs2_inode_revalidate(dentry);
        if (err) {
                if (err != -ENOENT)
                        mlog_errno(err);
                goto bail;
        }

        generic_fillattr(inode, stat);

        /* We set the blksize from the cluster size for performance */
        stat->blksize = osb->s_clustersize;

bail:
        mlog_exit(err);

        return err;
}

static int ocfs2_write_remove_suid(struct inode *inode)
{
        int ret;
        struct buffer_head *bh = NULL;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_journal_handle *handle;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di;

        mlog_entry("(Inode %llu, mode 0%o)\n",
                   (unsigned long long)oi->ip_blkno, inode->i_mode);

        handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }

        ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_trans;
        }

        ret = ocfs2_journal_access(handle, inode, bh,
                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_bh;
        }

        inode->i_mode &= ~S_ISUID;
        if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
                inode->i_mode &= ~S_ISGID;

        di = (struct ocfs2_dinode *) bh->b_data;
        di->i_mode = cpu_to_le16(inode->i_mode);

        ret = ocfs2_journal_dirty(handle, bh);
        if (ret < 0)
                mlog_errno(ret);
out_bh:
        brelse(bh);
out_trans:
        ocfs2_commit_trans(handle);
out:
        mlog_exit(ret);
        return ret;
}

static inline int ocfs2_write_should_remove_suid(struct inode *inode)
{
        mode_t mode = inode->i_mode;

        if (!capable(CAP_FSETID)) {
                if (unlikely(mode & S_ISUID))
                        return 1;

                if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
                        return 1;
        }
        return 0;
}
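/*
 * Lock ordering on the write path, outermost first: i_mutex ->
 * i_alloc_sem (O_DIRECT only) -> rw_lock -> meta lock. This matches
 * the i_mutex -> i_alloc_sem -> rw_lock ordering that ocfs2_setattr()
 * relies on; the meta lock level is chosen (and rechecked) inside the
 * loop below depending on whether the write extends the file.
 */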
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
                                    const char __user *buf,
                                    size_t count,
                                    loff_t pos)
{
        struct iovec local_iov = { .iov_base = (void __user *)buf,
                                   .iov_len = count };
        int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
        u32 clusters;
        struct file *filp = iocb->ki_filp;
        struct inode *inode = filp->f_dentry->d_inode;
        loff_t newsize, saved_pos;

        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
                   (unsigned int)count,
                   filp->f_dentry->d_name.len,
                   filp->f_dentry->d_name.name);

        /* happy write of zero bytes */
        if (count == 0)
                return 0;

        if (!inode) {
                mlog(0, "bad inode\n");
                return -EIO;
        }

        mutex_lock(&inode->i_mutex);
        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
        if (filp->f_flags & O_DIRECT) {
                have_alloc_sem = 1;
                down_read(&inode->i_alloc_sem);
        }

        /* concurrent O_DIRECT writes are allowed */
        rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                rw_level = -1;
                mlog_errno(ret);
                goto out;
        }

        /*
         * We sample i_size under a read level meta lock to see if our
         * write is extending the file; if it is, we back off and get a
         * write level meta lock.
         */
        meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
        for (;;) {
                ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
                if (ret < 0) {
                        meta_level = -1;
                        mlog_errno(ret);
                        goto out;
                }

                /* Clear suid / sgid if necessary. We do this here
                 * instead of later in the write path because
                 * remove_suid() calls ->setattr without any hint that
                 * we may have already done our cluster locking. Since
                 * ocfs2_setattr() *must* take cluster locks to
                 * proceed, this will lead us to recursively lock the
                 * inode. There's also the dinode i_size state which
                 * can be lost via setattr during extending writes (we
                 * set inode->i_size at the end of a write.) */
                if (ocfs2_write_should_remove_suid(inode)) {
                        if (meta_level == 0) {
                                ocfs2_meta_unlock(inode, meta_level);
                                meta_level = 1;
                                continue;
                        }

                        ret = ocfs2_write_remove_suid(inode);
                        if (ret < 0) {
                                mlog_errno(ret);
                                goto out;
                        }
                }

                /* Work on a copy of ppos until we're sure that we
                 * won't have to recalculate it due to relocking. */
                if (filp->f_flags & O_APPEND) {
                        saved_pos = i_size_read(inode);
                        mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
                } else {
                        saved_pos = iocb->ki_pos;
                }
                newsize = count + saved_pos;

                mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
                     (long long) saved_pos, (long long) newsize,
                     (long long) i_size_read(inode));

                /* No need for a higher level metadata lock if we're
                 * never going past i_size. */
                if (newsize <= i_size_read(inode))
                        break;

                if (meta_level == 0) {
                        ocfs2_meta_unlock(inode, meta_level);
                        meta_level = 1;
                        continue;
                }

                spin_lock(&OCFS2_I(inode)->ip_lock);
                clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
                        OCFS2_I(inode)->ip_clusters;
                spin_unlock(&OCFS2_I(inode)->ip_lock);

                mlog(0, "Writing at EOF, may need more allocation: "
                     "i_size = %lld, newsize = %lld, need %u clusters\n",
                     (long long) i_size_read(inode), (long long) newsize,
                     clusters);

                /* We only want to continue the rest of this loop if
                 * our extend will actually require more
                 * allocation. */
                if (!clusters)
                        break;

                ret = ocfs2_extend_file(inode, NULL, newsize, count);
                if (ret < 0) {
                        if (ret != -ENOSPC)
                                mlog_errno(ret);
                        goto out;
                }
                break;
        }

        /* ok, we're done with i_size and alloc work */
        iocb->ki_pos = saved_pos;
        ocfs2_meta_unlock(inode, meta_level);
        meta_level = -1;

        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb);

        ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);

        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

        /*
         * Deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an
         * ocfs2_dio_end_io function pointer which is called when
         * o_direct io completes so that it can unlock our rw lock.
         * (it's the clustered equivalent of i_alloc_sem; protects
         * truncate from racing with pending ios). Unfortunately there
         * are error cases which call end_io and others that don't, so
         * we don't have to unlock the rw_lock if either an async dio
         * is going to do it in the future or an end_io after an error
         * has already done it.
         */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
                have_alloc_sem = 0;
        }

out:
        if (meta_level != -1)
                ocfs2_meta_unlock(inode, meta_level);
        if (have_alloc_sem)
                up_read(&inode->i_alloc_sem);
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
        mutex_unlock(&inode->i_mutex);

        mlog_exit(ret);
        return ret;
}
static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                                   char __user *buf,
                                   size_t count,
                                   loff_t pos)
{
        int ret = 0, rw_level = -1, have_alloc_sem = 0;
        struct file *filp = iocb->ki_filp;
        struct inode *inode = filp->f_dentry->d_inode;

        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
                   (unsigned int)count,
                   filp->f_dentry->d_name.len,
                   filp->f_dentry->d_name.name);

        if (!inode) {
                ret = -EINVAL;
                mlog_errno(ret);
                goto bail;
        }

        /*
         * Buffered reads protect themselves in ->readpage(). O_DIRECT
         * reads need locks to protect pending reads from racing with
         * truncate.
         */
        if (filp->f_flags & O_DIRECT) {
                down_read(&inode->i_alloc_sem);
                have_alloc_sem = 1;

                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto bail;
                }
                rw_level = 0;
                /* communicate with ocfs2_dio_end_io */
                ocfs2_iocb_set_rw_locked(iocb);
        }

        /*
         * We're fine letting folks race truncates and extending
         * writes with read across the cluster, just like they can
         * locally. Hence no rw_lock during read.
         *
         * Take and drop the meta data lock to update inode fields
         * like i_size. This gives the checks down inside
         * generic_file_aio_read() a chance of actually working.
         */
        ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
        if (ret < 0) {
                mlog_errno(ret);
                goto bail;
        }
        ocfs2_meta_unlock(inode, 0);

        ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
        if (ret == -EINVAL)
                mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");

        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

        /* see ocfs2_file_aio_write */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
                have_alloc_sem = 0;
        }

bail:
        if (have_alloc_sem)
                up_read(&inode->i_alloc_sem);
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
        mlog_exit(ret);

        return ret;
}
struct inode_operations ocfs2_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
};

struct inode_operations ocfs2_special_file_iops = {
        .setattr        = ocfs2_setattr,
        .getattr        = ocfs2_getattr,
};

const struct file_operations ocfs2_fops = {
        .read           = do_sync_read,
        .write          = do_sync_write,
        .sendfile       = generic_file_sendfile,
        .mmap           = ocfs2_mmap,
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_file_release,
        .open           = ocfs2_file_open,
        .aio_read       = ocfs2_file_aio_read,
        .aio_write      = ocfs2_file_aio_write,
        .ioctl          = ocfs2_ioctl,
};

const struct file_operations ocfs2_dops = {
        .read           = generic_read_dir,
        .readdir        = ocfs2_readdir,
        .fsync          = ocfs2_sync_file,
        .ioctl          = ocfs2_ioctl,
};