1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 34 #define MLOG_MASK_PREFIX ML_INODE 35 #include <cluster/masklog.h> 36 37 #include "ocfs2.h" 38 39 #include "alloc.h" 40 #include "aops.h" 41 #include "dir.h" 42 #include "dlmglue.h" 43 #include "extent_map.h" 44 #include "file.h" 45 #include "sysfile.h" 46 #include "inode.h" 47 #include "journal.h" 48 #include "mmap.h" 49 #include "suballoc.h" 50 #include "super.h" 51 52 #include "buffer_head_io.h" 53 54 static int ocfs2_sync_inode(struct inode *inode) 55 { 56 filemap_fdatawrite(inode->i_mapping); 57 return sync_mapping_buffers(inode->i_mapping); 58 } 59 60 static int ocfs2_file_open(struct inode *inode, struct file *file) 61 { 62 int status; 63 int mode = file->f_flags; 64 struct ocfs2_inode_info *oi = OCFS2_I(inode); 65 66 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 67 file->f_dentry->d_name.len, file->f_dentry->d_name.name); 68 69 spin_lock(&oi->ip_lock); 70 71 /* Check that the inode hasn't been wiped from disk by another 72 * node. If it hasn't then we're safe as long as we hold the 73 * spin lock until our increment of open count. */ 74 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 75 spin_unlock(&oi->ip_lock); 76 77 status = -ENOENT; 78 goto leave; 79 } 80 81 if (mode & O_DIRECT) 82 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 83 84 oi->ip_open_count++; 85 spin_unlock(&oi->ip_lock); 86 status = 0; 87 leave: 88 mlog_exit(status); 89 return status; 90 } 91 92 static int ocfs2_file_release(struct inode *inode, struct file *file) 93 { 94 struct ocfs2_inode_info *oi = OCFS2_I(inode); 95 96 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 97 file->f_dentry->d_name.len, 98 file->f_dentry->d_name.name); 99 100 spin_lock(&oi->ip_lock); 101 if (!--oi->ip_open_count) 102 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 103 spin_unlock(&oi->ip_lock); 104 105 mlog_exit(0); 106 107 return 0; 108 } 109 110 static int ocfs2_sync_file(struct file *file, 111 struct dentry *dentry, 112 int datasync) 113 { 114 int err = 0; 115 journal_t *journal; 116 struct inode *inode = dentry->d_inode; 117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 118 119 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 120 dentry->d_name.len, dentry->d_name.name); 121 122 err = ocfs2_sync_inode(dentry->d_inode); 123 if (err) 124 goto bail; 125 126 journal = osb->journal->j_journal; 127 err = journal_force_commit(journal); 128 129 bail: 130 mlog_exit(err); 131 132 return (err < 0) ? -EIO : 0; 133 } 134 135 int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, 136 struct inode *inode, 137 struct buffer_head *fe_bh, 138 u64 new_i_size) 139 { 140 int status; 141 142 mlog_entry_void(); 143 i_size_write(inode, new_i_size); 144 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 145 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 146 147 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 148 if (status < 0) { 149 mlog_errno(status); 150 goto bail; 151 } 152 153 bail: 154 mlog_exit(status); 155 return status; 156 } 157 158 static int ocfs2_simple_size_update(struct inode *inode, 159 struct buffer_head *di_bh, 160 u64 new_i_size) 161 { 162 int ret; 163 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 164 struct ocfs2_journal_handle *handle = NULL; 165 166 handle = ocfs2_start_trans(osb, NULL, 167 OCFS2_INODE_UPDATE_CREDITS); 168 if (handle == NULL) { 169 ret = -ENOMEM; 170 mlog_errno(ret); 171 goto out; 172 } 173 174 ret = ocfs2_set_inode_size(handle, inode, di_bh, 175 new_i_size); 176 if (ret < 0) 177 mlog_errno(ret); 178 179 ocfs2_commit_trans(handle); 180 out: 181 return ret; 182 } 183 184 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 185 struct inode *inode, 186 struct buffer_head *fe_bh, 187 u64 new_i_size) 188 { 189 int status; 190 struct ocfs2_journal_handle *handle; 191 192 mlog_entry_void(); 193 194 /* TODO: This needs to actually orphan the inode in this 195 * transaction. */ 196 197 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 198 if (IS_ERR(handle)) { 199 status = PTR_ERR(handle); 200 mlog_errno(status); 201 goto out; 202 } 203 204 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 205 if (status < 0) 206 mlog_errno(status); 207 208 ocfs2_commit_trans(handle); 209 out: 210 mlog_exit(status); 211 return status; 212 } 213 214 static int ocfs2_truncate_file(struct inode *inode, 215 struct buffer_head *di_bh, 216 u64 new_i_size) 217 { 218 int status = 0; 219 struct ocfs2_dinode *fe = NULL; 220 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 221 struct ocfs2_truncate_context *tc = NULL; 222 223 mlog_entry("(inode = %llu, new_i_size = %llu\n", 224 (unsigned long long)OCFS2_I(inode)->ip_blkno, 225 (unsigned long long)new_i_size); 226 227 truncate_inode_pages(inode->i_mapping, new_i_size); 228 229 fe = (struct ocfs2_dinode *) di_bh->b_data; 230 if (!OCFS2_IS_VALID_DINODE(fe)) { 231 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 232 status = -EIO; 233 goto bail; 234 } 235 236 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 237 "Inode %llu, inode i_size = %lld != di " 238 "i_size = %llu, i_flags = 0x%x\n", 239 (unsigned long long)OCFS2_I(inode)->ip_blkno, 240 i_size_read(inode), 241 (unsigned long long)le64_to_cpu(fe->i_size), 242 le32_to_cpu(fe->i_flags)); 243 244 if (new_i_size > le64_to_cpu(fe->i_size)) { 245 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 246 (unsigned long long)le64_to_cpu(fe->i_size), 247 (unsigned long long)new_i_size); 248 status = -EINVAL; 249 mlog_errno(status); 250 goto bail; 251 } 252 253 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 254 (unsigned long long)le64_to_cpu(fe->i_blkno), 255 (unsigned long long)le64_to_cpu(fe->i_size), 256 (unsigned long long)new_i_size); 257 258 /* lets handle the simple truncate cases before doing any more 259 * cluster locking. */ 260 if (new_i_size == le64_to_cpu(fe->i_size)) 261 goto bail; 262 263 /* This forces other nodes to sync and drop their pages. Do 264 * this even if we have a truncate without allocation change - 265 * ocfs2 cluster sizes can be much greater than page size, so 266 * we have to truncate them anyway. */ 267 status = ocfs2_data_lock(inode, 1); 268 if (status < 0) { 269 mlog_errno(status); 270 goto bail; 271 } 272 ocfs2_data_unlock(inode, 1); 273 274 if (le32_to_cpu(fe->i_clusters) == 275 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { 276 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", 277 fe->i_clusters); 278 /* No allocation change is required, so lets fast path 279 * this truncate. */ 280 status = ocfs2_simple_size_update(inode, di_bh, new_i_size); 281 if (status < 0) 282 mlog_errno(status); 283 goto bail; 284 } 285 286 /* alright, we're going to need to do a full blown alloc size 287 * change. Orphan the inode so that recovery can complete the 288 * truncate if necessary. This does the task of marking 289 * i_size. */ 290 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 291 if (status < 0) { 292 mlog_errno(status); 293 goto bail; 294 } 295 296 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 297 if (status < 0) { 298 mlog_errno(status); 299 goto bail; 300 } 301 302 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 303 if (status < 0) { 304 mlog_errno(status); 305 goto bail; 306 } 307 308 /* TODO: orphan dir cleanup here. */ 309 bail: 310 311 mlog_exit(status); 312 return status; 313 } 314 315 /* 316 * extend allocation only here. 317 * we'll update all the disk stuff, and oip->alloc_size 318 * 319 * expect stuff to be locked, a transaction started and enough data / 320 * metadata reservations in the contexts. 321 * 322 * Will return -EAGAIN, and a reason if a restart is needed. 323 * If passed in, *reason will always be set, even in error. 324 */ 325 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 326 struct inode *inode, 327 u32 clusters_to_add, 328 struct buffer_head *fe_bh, 329 struct ocfs2_journal_handle *handle, 330 struct ocfs2_alloc_context *data_ac, 331 struct ocfs2_alloc_context *meta_ac, 332 enum ocfs2_alloc_restarted *reason_ret) 333 { 334 int status = 0; 335 int free_extents; 336 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 337 enum ocfs2_alloc_restarted reason = RESTART_NONE; 338 u32 bit_off, num_bits; 339 u64 block; 340 341 BUG_ON(!clusters_to_add); 342 343 free_extents = ocfs2_num_free_extents(osb, inode, fe); 344 if (free_extents < 0) { 345 status = free_extents; 346 mlog_errno(status); 347 goto leave; 348 } 349 350 /* there are two cases which could cause us to EAGAIN in the 351 * we-need-more-metadata case: 352 * 1) we haven't reserved *any* 353 * 2) we are so fragmented, we've needed to add metadata too 354 * many times. */ 355 if (!free_extents && !meta_ac) { 356 mlog(0, "we haven't reserved any metadata!\n"); 357 status = -EAGAIN; 358 reason = RESTART_META; 359 goto leave; 360 } else if ((!free_extents) 361 && (ocfs2_alloc_context_bits_left(meta_ac) 362 < ocfs2_extend_meta_needed(fe))) { 363 mlog(0, "filesystem is really fragmented...\n"); 364 status = -EAGAIN; 365 reason = RESTART_META; 366 goto leave; 367 } 368 369 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 370 &bit_off, &num_bits); 371 if (status < 0) { 372 if (status != -ENOSPC) 373 mlog_errno(status); 374 goto leave; 375 } 376 377 BUG_ON(num_bits > clusters_to_add); 378 379 /* reserve our write early -- insert_extent may update the inode */ 380 status = ocfs2_journal_access(handle, inode, fe_bh, 381 OCFS2_JOURNAL_ACCESS_WRITE); 382 if (status < 0) { 383 mlog_errno(status); 384 goto leave; 385 } 386 387 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 388 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 389 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 390 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 391 num_bits, meta_ac); 392 if (status < 0) { 393 mlog_errno(status); 394 goto leave; 395 } 396 397 le32_add_cpu(&fe->i_clusters, num_bits); 398 spin_lock(&OCFS2_I(inode)->ip_lock); 399 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 400 spin_unlock(&OCFS2_I(inode)->ip_lock); 401 402 status = ocfs2_journal_dirty(handle, fe_bh); 403 if (status < 0) { 404 mlog_errno(status); 405 goto leave; 406 } 407 408 clusters_to_add -= num_bits; 409 410 if (clusters_to_add) { 411 mlog(0, "need to alloc once more, clusters = %u, wanted = " 412 "%u\n", fe->i_clusters, clusters_to_add); 413 status = -EAGAIN; 414 reason = RESTART_TRANS; 415 } 416 417 leave: 418 mlog_exit(status); 419 if (reason_ret) 420 *reason_ret = reason; 421 return status; 422 } 423 424 static int ocfs2_extend_allocation(struct inode *inode, 425 u32 clusters_to_add) 426 { 427 int status = 0; 428 int restart_func = 0; 429 int drop_alloc_sem = 0; 430 int credits, num_free_extents; 431 u32 prev_clusters; 432 struct buffer_head *bh = NULL; 433 struct ocfs2_dinode *fe = NULL; 434 struct ocfs2_journal_handle *handle = NULL; 435 struct ocfs2_alloc_context *data_ac = NULL; 436 struct ocfs2_alloc_context *meta_ac = NULL; 437 enum ocfs2_alloc_restarted why; 438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 439 440 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 441 442 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 443 OCFS2_BH_CACHED, inode); 444 if (status < 0) { 445 mlog_errno(status); 446 goto leave; 447 } 448 449 fe = (struct ocfs2_dinode *) bh->b_data; 450 if (!OCFS2_IS_VALID_DINODE(fe)) { 451 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 452 status = -EIO; 453 goto leave; 454 } 455 456 restart_all: 457 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 458 459 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " 460 "clusters_to_add = %u\n", 461 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 462 fe->i_clusters, clusters_to_add); 463 464 handle = ocfs2_alloc_handle(osb); 465 if (handle == NULL) { 466 status = -ENOMEM; 467 mlog_errno(status); 468 goto leave; 469 } 470 471 num_free_extents = ocfs2_num_free_extents(osb, 472 inode, 473 fe); 474 if (num_free_extents < 0) { 475 status = num_free_extents; 476 mlog_errno(status); 477 goto leave; 478 } 479 480 if (!num_free_extents) { 481 status = ocfs2_reserve_new_metadata(osb, 482 handle, 483 fe, 484 &meta_ac); 485 if (status < 0) { 486 if (status != -ENOSPC) 487 mlog_errno(status); 488 goto leave; 489 } 490 } 491 492 status = ocfs2_reserve_clusters(osb, 493 handle, 494 clusters_to_add, 495 &data_ac); 496 if (status < 0) { 497 if (status != -ENOSPC) 498 mlog_errno(status); 499 goto leave; 500 } 501 502 /* blocks peope in read/write from reading our allocation 503 * until we're done changing it. We depend on i_mutex to block 504 * other extend/truncate calls while we're here. Ordering wrt 505 * start_trans is important here -- always do it before! */ 506 down_write(&OCFS2_I(inode)->ip_alloc_sem); 507 drop_alloc_sem = 1; 508 509 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 510 handle = ocfs2_start_trans(osb, handle, credits); 511 if (IS_ERR(handle)) { 512 status = PTR_ERR(handle); 513 handle = NULL; 514 mlog_errno(status); 515 goto leave; 516 } 517 518 restarted_transaction: 519 /* reserve a write to the file entry early on - that we if we 520 * run out of credits in the allocation path, we can still 521 * update i_size. */ 522 status = ocfs2_journal_access(handle, inode, bh, 523 OCFS2_JOURNAL_ACCESS_WRITE); 524 if (status < 0) { 525 mlog_errno(status); 526 goto leave; 527 } 528 529 prev_clusters = OCFS2_I(inode)->ip_clusters; 530 531 status = ocfs2_do_extend_allocation(osb, 532 inode, 533 clusters_to_add, 534 bh, 535 handle, 536 data_ac, 537 meta_ac, 538 &why); 539 if ((status < 0) && (status != -EAGAIN)) { 540 if (status != -ENOSPC) 541 mlog_errno(status); 542 goto leave; 543 } 544 545 status = ocfs2_journal_dirty(handle, bh); 546 if (status < 0) { 547 mlog_errno(status); 548 goto leave; 549 } 550 551 spin_lock(&OCFS2_I(inode)->ip_lock); 552 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 553 spin_unlock(&OCFS2_I(inode)->ip_lock); 554 555 if (why != RESTART_NONE && clusters_to_add) { 556 if (why == RESTART_META) { 557 mlog(0, "restarting function.\n"); 558 restart_func = 1; 559 } else { 560 BUG_ON(why != RESTART_TRANS); 561 562 mlog(0, "restarting transaction.\n"); 563 /* TODO: This can be more intelligent. */ 564 credits = ocfs2_calc_extend_credits(osb->sb, 565 fe, 566 clusters_to_add); 567 status = ocfs2_extend_trans(handle, credits); 568 if (status < 0) { 569 /* handle still has to be committed at 570 * this point. */ 571 status = -ENOMEM; 572 mlog_errno(status); 573 goto leave; 574 } 575 goto restarted_transaction; 576 } 577 } 578 579 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 580 fe->i_clusters, (unsigned long long)fe->i_size); 581 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 582 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 583 584 leave: 585 if (drop_alloc_sem) { 586 up_write(&OCFS2_I(inode)->ip_alloc_sem); 587 drop_alloc_sem = 0; 588 } 589 if (handle) { 590 ocfs2_commit_trans(handle); 591 handle = NULL; 592 } 593 if (data_ac) { 594 ocfs2_free_alloc_context(data_ac); 595 data_ac = NULL; 596 } 597 if (meta_ac) { 598 ocfs2_free_alloc_context(meta_ac); 599 meta_ac = NULL; 600 } 601 if ((!status) && restart_func) { 602 restart_func = 0; 603 goto restart_all; 604 } 605 if (bh) { 606 brelse(bh); 607 bh = NULL; 608 } 609 610 mlog_exit(status); 611 return status; 612 } 613 614 /* Some parts of this taken from generic_cont_expand, which turned out 615 * to be too fragile to do exactly what we need without us having to 616 * worry about recursive locking in ->commit_write(). */ 617 static int ocfs2_write_zero_page(struct inode *inode, 618 u64 size) 619 { 620 struct address_space *mapping = inode->i_mapping; 621 struct page *page; 622 unsigned long index; 623 unsigned int offset; 624 struct ocfs2_journal_handle *handle = NULL; 625 int ret; 626 627 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 628 /* ugh. in prepare/commit_write, if from==to==start of block, we 629 ** skip the prepare. make sure we never send an offset for the start 630 ** of a block 631 */ 632 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 633 offset++; 634 } 635 index = size >> PAGE_CACHE_SHIFT; 636 637 page = grab_cache_page(mapping, index); 638 if (!page) { 639 ret = -ENOMEM; 640 mlog_errno(ret); 641 goto out; 642 } 643 644 ret = ocfs2_prepare_write(NULL, page, offset, offset); 645 if (ret < 0) { 646 mlog_errno(ret); 647 goto out_unlock; 648 } 649 650 if (ocfs2_should_order_data(inode)) { 651 handle = ocfs2_start_walk_page_trans(inode, page, offset, 652 offset); 653 if (IS_ERR(handle)) { 654 ret = PTR_ERR(handle); 655 handle = NULL; 656 goto out_unlock; 657 } 658 } 659 660 /* must not update i_size! */ 661 ret = block_commit_write(page, offset, offset); 662 if (ret < 0) 663 mlog_errno(ret); 664 else 665 ret = 0; 666 667 if (handle) 668 ocfs2_commit_trans(handle); 669 out_unlock: 670 unlock_page(page); 671 page_cache_release(page); 672 out: 673 return ret; 674 } 675 676 static int ocfs2_zero_extend(struct inode *inode, 677 u64 zero_to_size) 678 { 679 int ret = 0; 680 u64 start_off; 681 struct super_block *sb = inode->i_sb; 682 683 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 684 while (start_off < zero_to_size) { 685 ret = ocfs2_write_zero_page(inode, start_off); 686 if (ret < 0) { 687 mlog_errno(ret); 688 goto out; 689 } 690 691 start_off += sb->s_blocksize; 692 } 693 694 out: 695 return ret; 696 } 697 698 static int ocfs2_extend_file(struct inode *inode, 699 struct buffer_head *di_bh, 700 u64 new_i_size) 701 { 702 int ret = 0; 703 u32 clusters_to_add; 704 705 /* setattr sometimes calls us like this. */ 706 if (new_i_size == 0) 707 goto out; 708 709 if (i_size_read(inode) == new_i_size) 710 goto out; 711 BUG_ON(new_i_size < i_size_read(inode)); 712 713 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 714 OCFS2_I(inode)->ip_clusters; 715 716 if (clusters_to_add) { 717 ret = ocfs2_extend_allocation(inode, clusters_to_add); 718 if (ret < 0) { 719 mlog_errno(ret); 720 goto out; 721 } 722 723 ret = ocfs2_zero_extend(inode, new_i_size); 724 if (ret < 0) { 725 mlog_errno(ret); 726 goto out; 727 } 728 } 729 730 /* No allocation required, we just use this helper to 731 * do a trivial update of i_size. */ 732 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 733 if (ret < 0) { 734 mlog_errno(ret); 735 goto out; 736 } 737 738 out: 739 return ret; 740 } 741 742 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 743 { 744 int status = 0, size_change; 745 struct inode *inode = dentry->d_inode; 746 struct super_block *sb = inode->i_sb; 747 struct ocfs2_super *osb = OCFS2_SB(sb); 748 struct buffer_head *bh = NULL; 749 struct ocfs2_journal_handle *handle = NULL; 750 751 mlog_entry("(0x%p, '%.*s')\n", dentry, 752 dentry->d_name.len, dentry->d_name.name); 753 754 if (attr->ia_valid & ATTR_MODE) 755 mlog(0, "mode change: %d\n", attr->ia_mode); 756 if (attr->ia_valid & ATTR_UID) 757 mlog(0, "uid change: %d\n", attr->ia_uid); 758 if (attr->ia_valid & ATTR_GID) 759 mlog(0, "gid change: %d\n", attr->ia_gid); 760 if (attr->ia_valid & ATTR_SIZE) 761 mlog(0, "size change...\n"); 762 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 763 mlog(0, "time change...\n"); 764 765 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 766 | ATTR_GID | ATTR_UID | ATTR_MODE) 767 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 768 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 769 return 0; 770 } 771 772 status = inode_change_ok(inode, attr); 773 if (status) 774 return status; 775 776 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 777 if (size_change) { 778 status = ocfs2_rw_lock(inode, 1); 779 if (status < 0) { 780 mlog_errno(status); 781 goto bail; 782 } 783 } 784 785 status = ocfs2_meta_lock(inode, NULL, &bh, 1); 786 if (status < 0) { 787 if (status != -ENOENT) 788 mlog_errno(status); 789 goto bail_unlock_rw; 790 } 791 792 if (size_change && attr->ia_size != i_size_read(inode)) { 793 if (i_size_read(inode) > attr->ia_size) 794 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 795 else 796 status = ocfs2_extend_file(inode, bh, attr->ia_size); 797 if (status < 0) { 798 if (status != -ENOSPC) 799 mlog_errno(status); 800 status = -ENOSPC; 801 goto bail_unlock; 802 } 803 } 804 805 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 806 if (IS_ERR(handle)) { 807 status = PTR_ERR(handle); 808 mlog_errno(status); 809 goto bail_unlock; 810 } 811 812 status = inode_setattr(inode, attr); 813 if (status < 0) { 814 mlog_errno(status); 815 goto bail_commit; 816 } 817 818 status = ocfs2_mark_inode_dirty(handle, inode, bh); 819 if (status < 0) 820 mlog_errno(status); 821 822 bail_commit: 823 ocfs2_commit_trans(handle); 824 bail_unlock: 825 ocfs2_meta_unlock(inode, 1); 826 bail_unlock_rw: 827 if (size_change) 828 ocfs2_rw_unlock(inode, 1); 829 bail: 830 if (bh) 831 brelse(bh); 832 833 mlog_exit(status); 834 return status; 835 } 836 837 int ocfs2_getattr(struct vfsmount *mnt, 838 struct dentry *dentry, 839 struct kstat *stat) 840 { 841 struct inode *inode = dentry->d_inode; 842 struct super_block *sb = dentry->d_inode->i_sb; 843 struct ocfs2_super *osb = sb->s_fs_info; 844 int err; 845 846 mlog_entry_void(); 847 848 err = ocfs2_inode_revalidate(dentry); 849 if (err) { 850 if (err != -ENOENT) 851 mlog_errno(err); 852 goto bail; 853 } 854 855 generic_fillattr(inode, stat); 856 857 /* We set the blksize from the cluster size for performance */ 858 stat->blksize = osb->s_clustersize; 859 860 bail: 861 mlog_exit(err); 862 863 return err; 864 } 865 866 static int ocfs2_write_remove_suid(struct inode *inode) 867 { 868 int ret; 869 struct buffer_head *bh = NULL; 870 struct ocfs2_inode_info *oi = OCFS2_I(inode); 871 struct ocfs2_journal_handle *handle; 872 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 873 struct ocfs2_dinode *di; 874 875 mlog_entry("(Inode %llu, mode 0%o)\n", 876 (unsigned long long)oi->ip_blkno, inode->i_mode); 877 878 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); 879 if (handle == NULL) { 880 ret = -ENOMEM; 881 mlog_errno(ret); 882 goto out; 883 } 884 885 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 886 if (ret < 0) { 887 mlog_errno(ret); 888 goto out_trans; 889 } 890 891 ret = ocfs2_journal_access(handle, inode, bh, 892 OCFS2_JOURNAL_ACCESS_WRITE); 893 if (ret < 0) { 894 mlog_errno(ret); 895 goto out_bh; 896 } 897 898 inode->i_mode &= ~S_ISUID; 899 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 900 inode->i_mode &= ~S_ISGID; 901 902 di = (struct ocfs2_dinode *) bh->b_data; 903 di->i_mode = cpu_to_le16(inode->i_mode); 904 905 ret = ocfs2_journal_dirty(handle, bh); 906 if (ret < 0) 907 mlog_errno(ret); 908 out_bh: 909 brelse(bh); 910 out_trans: 911 ocfs2_commit_trans(handle); 912 out: 913 mlog_exit(ret); 914 return ret; 915 } 916 917 static inline int ocfs2_write_should_remove_suid(struct inode *inode) 918 { 919 mode_t mode = inode->i_mode; 920 921 if (!capable(CAP_FSETID)) { 922 if (unlikely(mode & S_ISUID)) 923 return 1; 924 925 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 926 return 1; 927 } 928 return 0; 929 } 930 931 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 932 const char __user *buf, 933 size_t count, 934 loff_t pos) 935 { 936 struct iovec local_iov = { .iov_base = (void __user *)buf, 937 .iov_len = count }; 938 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; 939 u32 clusters; 940 struct file *filp = iocb->ki_filp; 941 struct inode *inode = filp->f_dentry->d_inode; 942 loff_t newsize, saved_pos; 943 944 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 945 (unsigned int)count, 946 filp->f_dentry->d_name.len, 947 filp->f_dentry->d_name.name); 948 949 /* happy write of zero bytes */ 950 if (count == 0) 951 return 0; 952 953 if (!inode) { 954 mlog(0, "bad inode\n"); 955 return -EIO; 956 } 957 958 mutex_lock(&inode->i_mutex); 959 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 960 if (filp->f_flags & O_DIRECT) { 961 have_alloc_sem = 1; 962 down_read(&inode->i_alloc_sem); 963 } 964 965 /* concurrent O_DIRECT writes are allowed */ 966 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 967 ret = ocfs2_rw_lock(inode, rw_level); 968 if (ret < 0) { 969 rw_level = -1; 970 mlog_errno(ret); 971 goto out; 972 } 973 974 /* 975 * We sample i_size under a read level meta lock to see if our write 976 * is extending the file, if it is we back off and get a write level 977 * meta lock. 978 */ 979 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; 980 for(;;) { 981 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); 982 if (ret < 0) { 983 meta_level = -1; 984 mlog_errno(ret); 985 goto out; 986 } 987 988 /* Clear suid / sgid if necessary. We do this here 989 * instead of later in the write path because 990 * remove_suid() calls ->setattr without any hint that 991 * we may have already done our cluster locking. Since 992 * ocfs2_setattr() *must* take cluster locks to 993 * proceeed, this will lead us to recursively lock the 994 * inode. There's also the dinode i_size state which 995 * can be lost via setattr during extending writes (we 996 * set inode->i_size at the end of a write. */ 997 if (ocfs2_write_should_remove_suid(inode)) { 998 if (meta_level == 0) { 999 ocfs2_meta_unlock(inode, meta_level); 1000 meta_level = 1; 1001 continue; 1002 } 1003 1004 ret = ocfs2_write_remove_suid(inode); 1005 if (ret < 0) { 1006 mlog_errno(ret); 1007 goto out; 1008 } 1009 } 1010 1011 /* work on a copy of ppos until we're sure that we won't have 1012 * to recalculate it due to relocking. */ 1013 if (filp->f_flags & O_APPEND) { 1014 saved_pos = i_size_read(inode); 1015 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1016 } else { 1017 saved_pos = iocb->ki_pos; 1018 } 1019 newsize = count + saved_pos; 1020 1021 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1022 (long long) saved_pos, (long long) newsize, 1023 (long long) i_size_read(inode)); 1024 1025 /* No need for a higher level metadata lock if we're 1026 * never going past i_size. */ 1027 if (newsize <= i_size_read(inode)) 1028 break; 1029 1030 if (meta_level == 0) { 1031 ocfs2_meta_unlock(inode, meta_level); 1032 meta_level = 1; 1033 continue; 1034 } 1035 1036 spin_lock(&OCFS2_I(inode)->ip_lock); 1037 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1038 OCFS2_I(inode)->ip_clusters; 1039 spin_unlock(&OCFS2_I(inode)->ip_lock); 1040 1041 mlog(0, "Writing at EOF, may need more allocation: " 1042 "i_size = %lld, newsize = %lld, need %u clusters\n", 1043 (long long) i_size_read(inode), (long long) newsize, 1044 clusters); 1045 1046 /* We only want to continue the rest of this loop if 1047 * our extend will actually require more 1048 * allocation. */ 1049 if (!clusters) 1050 break; 1051 1052 ret = ocfs2_extend_allocation(inode, clusters); 1053 if (ret < 0) { 1054 if (ret != -ENOSPC) 1055 mlog_errno(ret); 1056 goto out; 1057 } 1058 1059 /* Fill any holes which would've been created by this 1060 * write. If we're O_APPEND, this will wind up 1061 * (correctly) being a noop. */ 1062 ret = ocfs2_zero_extend(inode, (u64) newsize - count); 1063 if (ret < 0) { 1064 mlog_errno(ret); 1065 goto out; 1066 } 1067 break; 1068 } 1069 1070 /* ok, we're done with i_size and alloc work */ 1071 iocb->ki_pos = saved_pos; 1072 ocfs2_meta_unlock(inode, meta_level); 1073 meta_level = -1; 1074 1075 /* communicate with ocfs2_dio_end_io */ 1076 ocfs2_iocb_set_rw_locked(iocb); 1077 1078 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); 1079 1080 /* buffered aio wouldn't have proper lock coverage today */ 1081 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1082 1083 /* 1084 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1085 * function pointer which is called when o_direct io completes so that 1086 * it can unlock our rw lock. (it's the clustered equivalent of 1087 * i_alloc_sem; protects truncate from racing with pending ios). 1088 * Unfortunately there are error cases which call end_io and others 1089 * that don't. so we don't have to unlock the rw_lock if either an 1090 * async dio is going to do it in the future or an end_io after an 1091 * error has already done it. 1092 */ 1093 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1094 rw_level = -1; 1095 have_alloc_sem = 0; 1096 } 1097 1098 out: 1099 if (meta_level != -1) 1100 ocfs2_meta_unlock(inode, meta_level); 1101 if (have_alloc_sem) 1102 up_read(&inode->i_alloc_sem); 1103 if (rw_level != -1) 1104 ocfs2_rw_unlock(inode, rw_level); 1105 mutex_unlock(&inode->i_mutex); 1106 1107 mlog_exit(ret); 1108 return ret; 1109 } 1110 1111 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1112 char __user *buf, 1113 size_t count, 1114 loff_t pos) 1115 { 1116 int ret = 0, rw_level = -1, have_alloc_sem = 0; 1117 struct file *filp = iocb->ki_filp; 1118 struct inode *inode = filp->f_dentry->d_inode; 1119 1120 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, 1121 (unsigned int)count, 1122 filp->f_dentry->d_name.len, 1123 filp->f_dentry->d_name.name); 1124 1125 if (!inode) { 1126 ret = -EINVAL; 1127 mlog_errno(ret); 1128 goto bail; 1129 } 1130 1131 /* 1132 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1133 * need locks to protect pending reads from racing with truncate. 1134 */ 1135 if (filp->f_flags & O_DIRECT) { 1136 down_read(&inode->i_alloc_sem); 1137 have_alloc_sem = 1; 1138 1139 ret = ocfs2_rw_lock(inode, 0); 1140 if (ret < 0) { 1141 mlog_errno(ret); 1142 goto bail; 1143 } 1144 rw_level = 0; 1145 /* communicate with ocfs2_dio_end_io */ 1146 ocfs2_iocb_set_rw_locked(iocb); 1147 } 1148 1149 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); 1150 if (ret == -EINVAL) 1151 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1152 1153 /* buffered aio wouldn't have proper lock coverage today */ 1154 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1155 1156 /* see ocfs2_file_aio_write */ 1157 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1158 rw_level = -1; 1159 have_alloc_sem = 0; 1160 } 1161 1162 bail: 1163 if (have_alloc_sem) 1164 up_read(&inode->i_alloc_sem); 1165 if (rw_level != -1) 1166 ocfs2_rw_unlock(inode, rw_level); 1167 mlog_exit(ret); 1168 1169 return ret; 1170 } 1171 1172 struct inode_operations ocfs2_file_iops = { 1173 .setattr = ocfs2_setattr, 1174 .getattr = ocfs2_getattr, 1175 }; 1176 1177 struct inode_operations ocfs2_special_file_iops = { 1178 .setattr = ocfs2_setattr, 1179 .getattr = ocfs2_getattr, 1180 }; 1181 1182 const struct file_operations ocfs2_fops = { 1183 .read = do_sync_read, 1184 .write = do_sync_write, 1185 .sendfile = generic_file_sendfile, 1186 .mmap = ocfs2_mmap, 1187 .fsync = ocfs2_sync_file, 1188 .release = ocfs2_file_release, 1189 .open = ocfs2_file_open, 1190 .aio_read = ocfs2_file_aio_read, 1191 .aio_write = ocfs2_file_aio_write, 1192 }; 1193 1194 const struct file_operations ocfs2_dops = { 1195 .read = generic_read_dir, 1196 .readdir = ocfs2_readdir, 1197 .fsync = ocfs2_sync_file, 1198 }; 1199