1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 #include <linux/sched.h> 34 #include <linux/pipe_fs_i.h> 35 #include <linux/mount.h> 36 37 #define MLOG_MASK_PREFIX ML_INODE 38 #include <cluster/masklog.h> 39 40 #include "ocfs2.h" 41 42 #include "alloc.h" 43 #include "aops.h" 44 #include "dir.h" 45 #include "dlmglue.h" 46 #include "extent_map.h" 47 #include "file.h" 48 #include "sysfile.h" 49 #include "inode.h" 50 #include "ioctl.h" 51 #include "journal.h" 52 #include "mmap.h" 53 #include "suballoc.h" 54 #include "super.h" 55 56 #include "buffer_head_io.h" 57 58 static int ocfs2_sync_inode(struct inode *inode) 59 { 60 filemap_fdatawrite(inode->i_mapping); 61 return sync_mapping_buffers(inode->i_mapping); 62 } 63 64 static int ocfs2_file_open(struct inode *inode, struct file *file) 65 { 66 int status; 67 int mode = file->f_flags; 68 struct ocfs2_inode_info *oi = OCFS2_I(inode); 
69 70 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 71 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 72 73 spin_lock(&oi->ip_lock); 74 75 /* Check that the inode hasn't been wiped from disk by another 76 * node. If it hasn't then we're safe as long as we hold the 77 * spin lock until our increment of open count. */ 78 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 79 spin_unlock(&oi->ip_lock); 80 81 status = -ENOENT; 82 goto leave; 83 } 84 85 if (mode & O_DIRECT) 86 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 87 88 oi->ip_open_count++; 89 spin_unlock(&oi->ip_lock); 90 status = 0; 91 leave: 92 mlog_exit(status); 93 return status; 94 } 95 96 static int ocfs2_file_release(struct inode *inode, struct file *file) 97 { 98 struct ocfs2_inode_info *oi = OCFS2_I(inode); 99 100 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 101 file->f_path.dentry->d_name.len, 102 file->f_path.dentry->d_name.name); 103 104 spin_lock(&oi->ip_lock); 105 if (!--oi->ip_open_count) 106 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 107 spin_unlock(&oi->ip_lock); 108 109 mlog_exit(0); 110 111 return 0; 112 } 113 114 static int ocfs2_sync_file(struct file *file, 115 struct dentry *dentry, 116 int datasync) 117 { 118 int err = 0; 119 journal_t *journal; 120 struct inode *inode = dentry->d_inode; 121 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 122 123 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 124 dentry->d_name.len, dentry->d_name.name); 125 126 err = ocfs2_sync_inode(dentry->d_inode); 127 if (err) 128 goto bail; 129 130 journal = osb->journal->j_journal; 131 err = journal_force_commit(journal); 132 133 bail: 134 mlog_exit(err); 135 136 return (err < 0) ? 
-EIO : 0; 137 } 138 139 int ocfs2_should_update_atime(struct inode *inode, 140 struct vfsmount *vfsmnt) 141 { 142 struct timespec now; 143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 144 145 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 146 return 0; 147 148 if ((inode->i_flags & S_NOATIME) || 149 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 150 return 0; 151 152 /* 153 * We can be called with no vfsmnt structure - NFSD will 154 * sometimes do this. 155 * 156 * Note that our action here is different than touch_atime() - 157 * if we can't tell whether this is a noatime mount, then we 158 * don't know whether to trust the value of s_atime_quantum. 159 */ 160 if (vfsmnt == NULL) 161 return 0; 162 163 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 164 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 165 return 0; 166 167 if (vfsmnt->mnt_flags & MNT_RELATIME) { 168 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 169 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 170 return 1; 171 172 return 0; 173 } 174 175 now = CURRENT_TIME; 176 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 177 return 0; 178 else 179 return 1; 180 } 181 182 int ocfs2_update_inode_atime(struct inode *inode, 183 struct buffer_head *bh) 184 { 185 int ret; 186 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 187 handle_t *handle; 188 189 mlog_entry_void(); 190 191 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 192 if (handle == NULL) { 193 ret = -ENOMEM; 194 mlog_errno(ret); 195 goto out; 196 } 197 198 inode->i_atime = CURRENT_TIME; 199 ret = ocfs2_mark_inode_dirty(handle, inode, bh); 200 if (ret < 0) 201 mlog_errno(ret); 202 203 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 204 out: 205 mlog_exit(ret); 206 return ret; 207 } 208 209 int ocfs2_set_inode_size(handle_t *handle, 210 struct inode *inode, 211 struct buffer_head *fe_bh, 212 u64 new_i_size) 213 { 214 int status; 215 216 
mlog_entry_void(); 217 i_size_write(inode, new_i_size); 218 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 219 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 221 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 if (status < 0) { 223 mlog_errno(status); 224 goto bail; 225 } 226 227 bail: 228 mlog_exit(status); 229 return status; 230 } 231 232 static int ocfs2_simple_size_update(struct inode *inode, 233 struct buffer_head *di_bh, 234 u64 new_i_size) 235 { 236 int ret; 237 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 238 handle_t *handle = NULL; 239 240 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 241 if (handle == NULL) { 242 ret = -ENOMEM; 243 mlog_errno(ret); 244 goto out; 245 } 246 247 ret = ocfs2_set_inode_size(handle, inode, di_bh, 248 new_i_size); 249 if (ret < 0) 250 mlog_errno(ret); 251 252 ocfs2_commit_trans(osb, handle); 253 out: 254 return ret; 255 } 256 257 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 258 struct inode *inode, 259 struct buffer_head *fe_bh, 260 u64 new_i_size) 261 { 262 int status; 263 handle_t *handle; 264 265 mlog_entry_void(); 266 267 /* TODO: This needs to actually orphan the inode in this 268 * transaction. 
*/ 269 270 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 271 if (IS_ERR(handle)) { 272 status = PTR_ERR(handle); 273 mlog_errno(status); 274 goto out; 275 } 276 277 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 278 if (status < 0) 279 mlog_errno(status); 280 281 ocfs2_commit_trans(osb, handle); 282 out: 283 mlog_exit(status); 284 return status; 285 } 286 287 static int ocfs2_truncate_file(struct inode *inode, 288 struct buffer_head *di_bh, 289 u64 new_i_size) 290 { 291 int status = 0; 292 struct ocfs2_dinode *fe = NULL; 293 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 294 struct ocfs2_truncate_context *tc = NULL; 295 296 mlog_entry("(inode = %llu, new_i_size = %llu\n", 297 (unsigned long long)OCFS2_I(inode)->ip_blkno, 298 (unsigned long long)new_i_size); 299 300 truncate_inode_pages(inode->i_mapping, new_i_size); 301 302 fe = (struct ocfs2_dinode *) di_bh->b_data; 303 if (!OCFS2_IS_VALID_DINODE(fe)) { 304 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 305 status = -EIO; 306 goto bail; 307 } 308 309 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 310 "Inode %llu, inode i_size = %lld != di " 311 "i_size = %llu, i_flags = 0x%x\n", 312 (unsigned long long)OCFS2_I(inode)->ip_blkno, 313 i_size_read(inode), 314 (unsigned long long)le64_to_cpu(fe->i_size), 315 le32_to_cpu(fe->i_flags)); 316 317 if (new_i_size > le64_to_cpu(fe->i_size)) { 318 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 319 (unsigned long long)le64_to_cpu(fe->i_size), 320 (unsigned long long)new_i_size); 321 status = -EINVAL; 322 mlog_errno(status); 323 goto bail; 324 } 325 326 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 327 (unsigned long long)le64_to_cpu(fe->i_blkno), 328 (unsigned long long)le64_to_cpu(fe->i_size), 329 (unsigned long long)new_i_size); 330 331 /* lets handle the simple truncate cases before doing any more 332 * cluster locking. 
*/ 333 if (new_i_size == le64_to_cpu(fe->i_size)) 334 goto bail; 335 336 /* This forces other nodes to sync and drop their pages. Do 337 * this even if we have a truncate without allocation change - 338 * ocfs2 cluster sizes can be much greater than page size, so 339 * we have to truncate them anyway. */ 340 status = ocfs2_data_lock(inode, 1); 341 if (status < 0) { 342 mlog_errno(status); 343 goto bail; 344 } 345 ocfs2_data_unlock(inode, 1); 346 347 if (le32_to_cpu(fe->i_clusters) == 348 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { 349 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", 350 fe->i_clusters); 351 /* No allocation change is required, so lets fast path 352 * this truncate. */ 353 status = ocfs2_simple_size_update(inode, di_bh, new_i_size); 354 if (status < 0) 355 mlog_errno(status); 356 goto bail; 357 } 358 359 /* alright, we're going to need to do a full blown alloc size 360 * change. Orphan the inode so that recovery can complete the 361 * truncate if necessary. This does the task of marking 362 * i_size. */ 363 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 364 if (status < 0) { 365 mlog_errno(status); 366 goto bail; 367 } 368 369 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 370 if (status < 0) { 371 mlog_errno(status); 372 goto bail; 373 } 374 375 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 376 if (status < 0) { 377 mlog_errno(status); 378 goto bail; 379 } 380 381 /* TODO: orphan dir cleanup here. */ 382 bail: 383 384 mlog_exit(status); 385 return status; 386 } 387 388 /* 389 * extend allocation only here. 390 * we'll update all the disk stuff, and oip->alloc_size 391 * 392 * expect stuff to be locked, a transaction started and enough data / 393 * metadata reservations in the contexts. 394 * 395 * Will return -EAGAIN, and a reason if a restart is needed. 396 * If passed in, *reason will always be set, even in error. 
397 */ 398 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 399 struct inode *inode, 400 u32 clusters_to_add, 401 struct buffer_head *fe_bh, 402 handle_t *handle, 403 struct ocfs2_alloc_context *data_ac, 404 struct ocfs2_alloc_context *meta_ac, 405 enum ocfs2_alloc_restarted *reason_ret) 406 { 407 int status = 0; 408 int free_extents; 409 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 410 enum ocfs2_alloc_restarted reason = RESTART_NONE; 411 u32 bit_off, num_bits; 412 u64 block; 413 414 BUG_ON(!clusters_to_add); 415 416 free_extents = ocfs2_num_free_extents(osb, inode, fe); 417 if (free_extents < 0) { 418 status = free_extents; 419 mlog_errno(status); 420 goto leave; 421 } 422 423 /* there are two cases which could cause us to EAGAIN in the 424 * we-need-more-metadata case: 425 * 1) we haven't reserved *any* 426 * 2) we are so fragmented, we've needed to add metadata too 427 * many times. */ 428 if (!free_extents && !meta_ac) { 429 mlog(0, "we haven't reserved any metadata!\n"); 430 status = -EAGAIN; 431 reason = RESTART_META; 432 goto leave; 433 } else if ((!free_extents) 434 && (ocfs2_alloc_context_bits_left(meta_ac) 435 < ocfs2_extend_meta_needed(fe))) { 436 mlog(0, "filesystem is really fragmented...\n"); 437 status = -EAGAIN; 438 reason = RESTART_META; 439 goto leave; 440 } 441 442 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 443 &bit_off, &num_bits); 444 if (status < 0) { 445 if (status != -ENOSPC) 446 mlog_errno(status); 447 goto leave; 448 } 449 450 BUG_ON(num_bits > clusters_to_add); 451 452 /* reserve our write early -- insert_extent may update the inode */ 453 status = ocfs2_journal_access(handle, inode, fe_bh, 454 OCFS2_JOURNAL_ACCESS_WRITE); 455 if (status < 0) { 456 mlog_errno(status); 457 goto leave; 458 } 459 460 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 461 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 462 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 463 status = 
ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 464 num_bits, meta_ac); 465 if (status < 0) { 466 mlog_errno(status); 467 goto leave; 468 } 469 470 le32_add_cpu(&fe->i_clusters, num_bits); 471 spin_lock(&OCFS2_I(inode)->ip_lock); 472 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 473 spin_unlock(&OCFS2_I(inode)->ip_lock); 474 475 status = ocfs2_journal_dirty(handle, fe_bh); 476 if (status < 0) { 477 mlog_errno(status); 478 goto leave; 479 } 480 481 clusters_to_add -= num_bits; 482 483 if (clusters_to_add) { 484 mlog(0, "need to alloc once more, clusters = %u, wanted = " 485 "%u\n", fe->i_clusters, clusters_to_add); 486 status = -EAGAIN; 487 reason = RESTART_TRANS; 488 } 489 490 leave: 491 mlog_exit(status); 492 if (reason_ret) 493 *reason_ret = reason; 494 return status; 495 } 496 497 static int ocfs2_extend_allocation(struct inode *inode, 498 u32 clusters_to_add) 499 { 500 int status = 0; 501 int restart_func = 0; 502 int drop_alloc_sem = 0; 503 int credits, num_free_extents; 504 u32 prev_clusters; 505 struct buffer_head *bh = NULL; 506 struct ocfs2_dinode *fe = NULL; 507 handle_t *handle = NULL; 508 struct ocfs2_alloc_context *data_ac = NULL; 509 struct ocfs2_alloc_context *meta_ac = NULL; 510 enum ocfs2_alloc_restarted why; 511 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 512 513 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 514 515 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 516 OCFS2_BH_CACHED, inode); 517 if (status < 0) { 518 mlog_errno(status); 519 goto leave; 520 } 521 522 fe = (struct ocfs2_dinode *) bh->b_data; 523 if (!OCFS2_IS_VALID_DINODE(fe)) { 524 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 525 status = -EIO; 526 goto leave; 527 } 528 529 restart_all: 530 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 531 532 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " 533 "clusters_to_add = %u\n", 534 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 535 
fe->i_clusters, clusters_to_add); 536 537 num_free_extents = ocfs2_num_free_extents(osb, 538 inode, 539 fe); 540 if (num_free_extents < 0) { 541 status = num_free_extents; 542 mlog_errno(status); 543 goto leave; 544 } 545 546 if (!num_free_extents) { 547 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); 548 if (status < 0) { 549 if (status != -ENOSPC) 550 mlog_errno(status); 551 goto leave; 552 } 553 } 554 555 status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); 556 if (status < 0) { 557 if (status != -ENOSPC) 558 mlog_errno(status); 559 goto leave; 560 } 561 562 /* blocks peope in read/write from reading our allocation 563 * until we're done changing it. We depend on i_mutex to block 564 * other extend/truncate calls while we're here. Ordering wrt 565 * start_trans is important here -- always do it before! */ 566 down_write(&OCFS2_I(inode)->ip_alloc_sem); 567 drop_alloc_sem = 1; 568 569 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 570 handle = ocfs2_start_trans(osb, credits); 571 if (IS_ERR(handle)) { 572 status = PTR_ERR(handle); 573 handle = NULL; 574 mlog_errno(status); 575 goto leave; 576 } 577 578 restarted_transaction: 579 /* reserve a write to the file entry early on - that we if we 580 * run out of credits in the allocation path, we can still 581 * update i_size. 
*/ 582 status = ocfs2_journal_access(handle, inode, bh, 583 OCFS2_JOURNAL_ACCESS_WRITE); 584 if (status < 0) { 585 mlog_errno(status); 586 goto leave; 587 } 588 589 prev_clusters = OCFS2_I(inode)->ip_clusters; 590 591 status = ocfs2_do_extend_allocation(osb, 592 inode, 593 clusters_to_add, 594 bh, 595 handle, 596 data_ac, 597 meta_ac, 598 &why); 599 if ((status < 0) && (status != -EAGAIN)) { 600 if (status != -ENOSPC) 601 mlog_errno(status); 602 goto leave; 603 } 604 605 status = ocfs2_journal_dirty(handle, bh); 606 if (status < 0) { 607 mlog_errno(status); 608 goto leave; 609 } 610 611 spin_lock(&OCFS2_I(inode)->ip_lock); 612 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 613 spin_unlock(&OCFS2_I(inode)->ip_lock); 614 615 if (why != RESTART_NONE && clusters_to_add) { 616 if (why == RESTART_META) { 617 mlog(0, "restarting function.\n"); 618 restart_func = 1; 619 } else { 620 BUG_ON(why != RESTART_TRANS); 621 622 mlog(0, "restarting transaction.\n"); 623 /* TODO: This can be more intelligent. */ 624 credits = ocfs2_calc_extend_credits(osb->sb, 625 fe, 626 clusters_to_add); 627 status = ocfs2_extend_trans(handle, credits); 628 if (status < 0) { 629 /* handle still has to be committed at 630 * this point. 
*/ 631 status = -ENOMEM; 632 mlog_errno(status); 633 goto leave; 634 } 635 goto restarted_transaction; 636 } 637 } 638 639 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 640 fe->i_clusters, (unsigned long long)fe->i_size); 641 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 642 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 643 644 leave: 645 if (drop_alloc_sem) { 646 up_write(&OCFS2_I(inode)->ip_alloc_sem); 647 drop_alloc_sem = 0; 648 } 649 if (handle) { 650 ocfs2_commit_trans(osb, handle); 651 handle = NULL; 652 } 653 if (data_ac) { 654 ocfs2_free_alloc_context(data_ac); 655 data_ac = NULL; 656 } 657 if (meta_ac) { 658 ocfs2_free_alloc_context(meta_ac); 659 meta_ac = NULL; 660 } 661 if ((!status) && restart_func) { 662 restart_func = 0; 663 goto restart_all; 664 } 665 if (bh) { 666 brelse(bh); 667 bh = NULL; 668 } 669 670 mlog_exit(status); 671 return status; 672 } 673 674 /* Some parts of this taken from generic_cont_expand, which turned out 675 * to be too fragile to do exactly what we need without us having to 676 * worry about recursive locking in ->prepare_write() and 677 * ->commit_write(). */ 678 static int ocfs2_write_zero_page(struct inode *inode, 679 u64 size) 680 { 681 struct address_space *mapping = inode->i_mapping; 682 struct page *page; 683 unsigned long index; 684 unsigned int offset; 685 handle_t *handle = NULL; 686 int ret; 687 688 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 689 /* ugh. in prepare/commit_write, if from==to==start of block, we 690 ** skip the prepare. 
make sure we never send an offset for the start 691 ** of a block 692 */ 693 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 694 offset++; 695 } 696 index = size >> PAGE_CACHE_SHIFT; 697 698 page = grab_cache_page(mapping, index); 699 if (!page) { 700 ret = -ENOMEM; 701 mlog_errno(ret); 702 goto out; 703 } 704 705 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 706 if (ret < 0) { 707 mlog_errno(ret); 708 goto out_unlock; 709 } 710 711 if (ocfs2_should_order_data(inode)) { 712 handle = ocfs2_start_walk_page_trans(inode, page, offset, 713 offset); 714 if (IS_ERR(handle)) { 715 ret = PTR_ERR(handle); 716 handle = NULL; 717 goto out_unlock; 718 } 719 } 720 721 /* must not update i_size! */ 722 ret = block_commit_write(page, offset, offset); 723 if (ret < 0) 724 mlog_errno(ret); 725 else 726 ret = 0; 727 728 if (handle) 729 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 730 out_unlock: 731 unlock_page(page); 732 page_cache_release(page); 733 out: 734 return ret; 735 } 736 737 static int ocfs2_zero_extend(struct inode *inode, 738 u64 zero_to_size) 739 { 740 int ret = 0; 741 u64 start_off; 742 struct super_block *sb = inode->i_sb; 743 744 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 745 while (start_off < zero_to_size) { 746 ret = ocfs2_write_zero_page(inode, start_off); 747 if (ret < 0) { 748 mlog_errno(ret); 749 goto out; 750 } 751 752 start_off += sb->s_blocksize; 753 754 /* 755 * Very large extends have the potential to lock up 756 * the cpu for extended periods of time. 757 */ 758 cond_resched(); 759 } 760 761 out: 762 return ret; 763 } 764 765 /* 766 * A tail_to_skip value > 0 indicates that we're being called from 767 * ocfs2_file_aio_write(). This has the following implications: 768 * 769 * - we don't want to update i_size 770 * - di_bh will be NULL, which is fine because it's only used in the 771 * case where we want to update i_size. 
772 * - ocfs2_zero_extend() will then only be filling the hole created 773 * between i_size and the start of the write. 774 */ 775 static int ocfs2_extend_file(struct inode *inode, 776 struct buffer_head *di_bh, 777 u64 new_i_size, 778 size_t tail_to_skip) 779 { 780 int ret = 0; 781 u32 clusters_to_add; 782 783 BUG_ON(!tail_to_skip && !di_bh); 784 785 /* setattr sometimes calls us like this. */ 786 if (new_i_size == 0) 787 goto out; 788 789 if (i_size_read(inode) == new_i_size) 790 goto out; 791 BUG_ON(new_i_size < i_size_read(inode)); 792 793 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 794 OCFS2_I(inode)->ip_clusters; 795 796 /* 797 * protect the pages that ocfs2_zero_extend is going to be 798 * pulling into the page cache.. we do this before the 799 * metadata extend so that we don't get into the situation 800 * where we've extended the metadata but can't get the data 801 * lock to zero. 802 */ 803 ret = ocfs2_data_lock(inode, 1); 804 if (ret < 0) { 805 mlog_errno(ret); 806 goto out; 807 } 808 809 if (clusters_to_add) { 810 ret = ocfs2_extend_allocation(inode, clusters_to_add); 811 if (ret < 0) { 812 mlog_errno(ret); 813 goto out_unlock; 814 } 815 } 816 817 /* 818 * Call this even if we don't add any clusters to the tree. We 819 * still need to zero the area between the old i_size and the 820 * new i_size. 
821 */ 822 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 823 if (ret < 0) { 824 mlog_errno(ret); 825 goto out_unlock; 826 } 827 828 if (!tail_to_skip) { 829 /* We're being called from ocfs2_setattr() which wants 830 * us to update i_size */ 831 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 832 if (ret < 0) 833 mlog_errno(ret); 834 } 835 836 out_unlock: 837 ocfs2_data_unlock(inode, 1); 838 839 out: 840 return ret; 841 } 842 843 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 844 { 845 int status = 0, size_change; 846 struct inode *inode = dentry->d_inode; 847 struct super_block *sb = inode->i_sb; 848 struct ocfs2_super *osb = OCFS2_SB(sb); 849 struct buffer_head *bh = NULL; 850 handle_t *handle = NULL; 851 852 mlog_entry("(0x%p, '%.*s')\n", dentry, 853 dentry->d_name.len, dentry->d_name.name); 854 855 if (attr->ia_valid & ATTR_MODE) 856 mlog(0, "mode change: %d\n", attr->ia_mode); 857 if (attr->ia_valid & ATTR_UID) 858 mlog(0, "uid change: %d\n", attr->ia_uid); 859 if (attr->ia_valid & ATTR_GID) 860 mlog(0, "gid change: %d\n", attr->ia_gid); 861 if (attr->ia_valid & ATTR_SIZE) 862 mlog(0, "size change...\n"); 863 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 864 mlog(0, "time change...\n"); 865 866 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 867 | ATTR_GID | ATTR_UID | ATTR_MODE) 868 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 869 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 870 return 0; 871 } 872 873 status = inode_change_ok(inode, attr); 874 if (status) 875 return status; 876 877 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 878 if (size_change) { 879 status = ocfs2_rw_lock(inode, 1); 880 if (status < 0) { 881 mlog_errno(status); 882 goto bail; 883 } 884 } 885 886 status = ocfs2_meta_lock(inode, &bh, 1); 887 if (status < 0) { 888 if (status != -ENOENT) 889 mlog_errno(status); 890 goto bail_unlock_rw; 891 } 892 893 if (size_change && 
attr->ia_size != i_size_read(inode)) { 894 if (i_size_read(inode) > attr->ia_size) 895 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 896 else 897 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 898 if (status < 0) { 899 if (status != -ENOSPC) 900 mlog_errno(status); 901 status = -ENOSPC; 902 goto bail_unlock; 903 } 904 } 905 906 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 907 if (IS_ERR(handle)) { 908 status = PTR_ERR(handle); 909 mlog_errno(status); 910 goto bail_unlock; 911 } 912 913 status = inode_setattr(inode, attr); 914 if (status < 0) { 915 mlog_errno(status); 916 goto bail_commit; 917 } 918 919 status = ocfs2_mark_inode_dirty(handle, inode, bh); 920 if (status < 0) 921 mlog_errno(status); 922 923 bail_commit: 924 ocfs2_commit_trans(osb, handle); 925 bail_unlock: 926 ocfs2_meta_unlock(inode, 1); 927 bail_unlock_rw: 928 if (size_change) 929 ocfs2_rw_unlock(inode, 1); 930 bail: 931 if (bh) 932 brelse(bh); 933 934 mlog_exit(status); 935 return status; 936 } 937 938 int ocfs2_getattr(struct vfsmount *mnt, 939 struct dentry *dentry, 940 struct kstat *stat) 941 { 942 struct inode *inode = dentry->d_inode; 943 struct super_block *sb = dentry->d_inode->i_sb; 944 struct ocfs2_super *osb = sb->s_fs_info; 945 int err; 946 947 mlog_entry_void(); 948 949 err = ocfs2_inode_revalidate(dentry); 950 if (err) { 951 if (err != -ENOENT) 952 mlog_errno(err); 953 goto bail; 954 } 955 956 generic_fillattr(inode, stat); 957 958 /* We set the blksize from the cluster size for performance */ 959 stat->blksize = osb->s_clustersize; 960 961 bail: 962 mlog_exit(err); 963 964 return err; 965 } 966 967 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 968 { 969 int ret; 970 971 mlog_entry_void(); 972 973 ret = ocfs2_meta_lock(inode, NULL, 0); 974 if (ret) { 975 mlog_errno(ret); 976 goto out; 977 } 978 979 ret = generic_permission(inode, mask, NULL); 980 981 ocfs2_meta_unlock(inode, 0); 982 out: 983 mlog_exit(ret); 984 return ret; 
985 } 986 987 static int ocfs2_write_remove_suid(struct inode *inode) 988 { 989 int ret; 990 struct buffer_head *bh = NULL; 991 struct ocfs2_inode_info *oi = OCFS2_I(inode); 992 handle_t *handle; 993 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 994 struct ocfs2_dinode *di; 995 996 mlog_entry("(Inode %llu, mode 0%o)\n", 997 (unsigned long long)oi->ip_blkno, inode->i_mode); 998 999 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1000 if (handle == NULL) { 1001 ret = -ENOMEM; 1002 mlog_errno(ret); 1003 goto out; 1004 } 1005 1006 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 1007 if (ret < 0) { 1008 mlog_errno(ret); 1009 goto out_trans; 1010 } 1011 1012 ret = ocfs2_journal_access(handle, inode, bh, 1013 OCFS2_JOURNAL_ACCESS_WRITE); 1014 if (ret < 0) { 1015 mlog_errno(ret); 1016 goto out_bh; 1017 } 1018 1019 inode->i_mode &= ~S_ISUID; 1020 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1021 inode->i_mode &= ~S_ISGID; 1022 1023 di = (struct ocfs2_dinode *) bh->b_data; 1024 di->i_mode = cpu_to_le16(inode->i_mode); 1025 1026 ret = ocfs2_journal_dirty(handle, bh); 1027 if (ret < 0) 1028 mlog_errno(ret); 1029 out_bh: 1030 brelse(bh); 1031 out_trans: 1032 ocfs2_commit_trans(osb, handle); 1033 out: 1034 mlog_exit(ret); 1035 return ret; 1036 } 1037 1038 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1039 loff_t *ppos, 1040 size_t count, 1041 int appending) 1042 { 1043 int ret = 0, meta_level = appending; 1044 struct inode *inode = dentry->d_inode; 1045 u32 clusters; 1046 loff_t newsize, saved_pos; 1047 1048 /* 1049 * We sample i_size under a read level meta lock to see if our write 1050 * is extending the file, if it is we back off and get a write level 1051 * meta lock. 1052 */ 1053 for(;;) { 1054 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1055 if (ret < 0) { 1056 meta_level = -1; 1057 mlog_errno(ret); 1058 goto out; 1059 } 1060 1061 /* Clear suid / sgid if necessary. 
We do this here 1062 * instead of later in the write path because 1063 * remove_suid() calls ->setattr without any hint that 1064 * we may have already done our cluster locking. Since 1065 * ocfs2_setattr() *must* take cluster locks to 1066 * proceeed, this will lead us to recursively lock the 1067 * inode. There's also the dinode i_size state which 1068 * can be lost via setattr during extending writes (we 1069 * set inode->i_size at the end of a write. */ 1070 if (should_remove_suid(dentry)) { 1071 if (meta_level == 0) { 1072 ocfs2_meta_unlock(inode, meta_level); 1073 meta_level = 1; 1074 continue; 1075 } 1076 1077 ret = ocfs2_write_remove_suid(inode); 1078 if (ret < 0) { 1079 mlog_errno(ret); 1080 goto out_unlock; 1081 } 1082 } 1083 1084 /* work on a copy of ppos until we're sure that we won't have 1085 * to recalculate it due to relocking. */ 1086 if (appending) { 1087 saved_pos = i_size_read(inode); 1088 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1089 } else { 1090 saved_pos = *ppos; 1091 } 1092 newsize = count + saved_pos; 1093 1094 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1095 (long long) saved_pos, (long long) newsize, 1096 (long long) i_size_read(inode)); 1097 1098 /* No need for a higher level metadata lock if we're 1099 * never going past i_size. */ 1100 if (newsize <= i_size_read(inode)) 1101 break; 1102 1103 if (meta_level == 0) { 1104 ocfs2_meta_unlock(inode, meta_level); 1105 meta_level = 1; 1106 continue; 1107 } 1108 1109 spin_lock(&OCFS2_I(inode)->ip_lock); 1110 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1111 OCFS2_I(inode)->ip_clusters; 1112 spin_unlock(&OCFS2_I(inode)->ip_lock); 1113 1114 mlog(0, "Writing at EOF, may need more allocation: " 1115 "i_size = %lld, newsize = %lld, need %u clusters\n", 1116 (long long) i_size_read(inode), (long long) newsize, 1117 clusters); 1118 1119 /* We only want to continue the rest of this loop if 1120 * our extend will actually require more 1121 * allocation. 
*/ 1122 if (!clusters) 1123 break; 1124 1125 ret = ocfs2_extend_file(inode, NULL, newsize, count); 1126 if (ret < 0) { 1127 if (ret != -ENOSPC) 1128 mlog_errno(ret); 1129 goto out_unlock; 1130 } 1131 break; 1132 } 1133 1134 if (appending) 1135 *ppos = saved_pos; 1136 1137 out_unlock: 1138 ocfs2_meta_unlock(inode, meta_level); 1139 1140 out: 1141 return ret; 1142 } 1143 1144 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1145 const struct iovec *iov, 1146 unsigned long nr_segs, 1147 loff_t pos) 1148 { 1149 int ret, rw_level, have_alloc_sem = 0; 1150 struct file *filp = iocb->ki_filp; 1151 struct inode *inode = filp->f_path.dentry->d_inode; 1152 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1153 1154 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1155 (unsigned int)nr_segs, 1156 filp->f_path.dentry->d_name.len, 1157 filp->f_path.dentry->d_name.name); 1158 1159 /* happy write of zero bytes */ 1160 if (iocb->ki_left == 0) 1161 return 0; 1162 1163 mutex_lock(&inode->i_mutex); 1164 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1165 if (filp->f_flags & O_DIRECT) { 1166 have_alloc_sem = 1; 1167 down_read(&inode->i_alloc_sem); 1168 } 1169 1170 /* concurrent O_DIRECT writes are allowed */ 1171 rw_level = (filp->f_flags & O_DIRECT) ? 
0 : 1; 1172 ret = ocfs2_rw_lock(inode, rw_level); 1173 if (ret < 0) { 1174 rw_level = -1; 1175 mlog_errno(ret); 1176 goto out; 1177 } 1178 1179 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1180 iocb->ki_left, appending); 1181 if (ret < 0) { 1182 mlog_errno(ret); 1183 goto out; 1184 } 1185 1186 /* communicate with ocfs2_dio_end_io */ 1187 ocfs2_iocb_set_rw_locked(iocb); 1188 1189 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1190 1191 /* buffered aio wouldn't have proper lock coverage today */ 1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1193 1194 /* 1195 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1196 * function pointer which is called when o_direct io completes so that 1197 * it can unlock our rw lock. (it's the clustered equivalent of 1198 * i_alloc_sem; protects truncate from racing with pending ios). 1199 * Unfortunately there are error cases which call end_io and others 1200 * that don't. so we don't have to unlock the rw_lock if either an 1201 * async dio is going to do it in the future or an end_io after an 1202 * error has already done it. 
 */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		/* the async dio completion (or an end_io on an error path)
		 * now owns the unlock; don't drop the locks again below */
		rw_level = -1;
		have_alloc_sem = 0;
	}

out:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	return ret;
}

/*
 * ->splice_write() for ocfs2: splice data from @pipe into @out at *ppos.
 *
 * Takes the cluster rw lock exclusive (level 1) for the whole splice,
 * after ordering the inode and pipe mutexes via inode_double_lock().
 * ocfs2_prepare_inode_for_write() resolves the position and does any
 * needed cluster allocation before the generic splice path runs.
 */
static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
				       struct file *out,
				       loff_t *ppos,
				       size_t len,
				       unsigned int flags)
{
	int ret;
	struct inode *inode = out->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
		   (unsigned int)len,
		   out->f_path.dentry->d_name.len,
		   out->f_path.dentry->d_name.name);

	inode_double_lock(inode, pipe->inode);

	ret = ocfs2_rw_lock(inode, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* ok, we're done with i_size and alloc work */
	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);

out_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	inode_double_unlock(inode, pipe->inode);

	mlog_exit(ret);
	return ret;
}

/*
 * ->splice_read() for ocfs2: splice data from @in at *ppos into @pipe.
 *
 * No rw lock is held across the read; the meta data lock is taken and
 * immediately dropped only to refresh inode fields such as i_size
 * before handing off to the generic splice path.
 */
static ssize_t ocfs2_file_splice_read(struct file *in,
				      loff_t *ppos,
				      struct pipe_inode_info *pipe,
				      size_t len,
				      unsigned int flags)
{
	int ret = 0;
	struct inode *inode = in->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
		   (unsigned int)len,
		   in->f_path.dentry->d_name.len,
		   in->f_path.dentry->d_name.name);

	/*
	 * See the comment in ocfs2_file_aio_read()
	 */
	ret = ocfs2_meta_lock(inode, NULL, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_meta_unlock(inode, 0);

	ret =
generic_file_splice_read(in, ppos, pipe, len, flags);

bail:
	mlog_exit(ret);
	return ret;
}

/*
 * ->aio_read() for ocfs2 regular files.
 *
 * Buffered reads are protected by ->readpage(), so only O_DIRECT reads
 * take i_alloc_sem and the shared (level 0) cluster rw lock, mirroring
 * the write path's protocol with ocfs2_dio_end_io for async completion.
 *
 * Returns bytes read from generic_file_aio_read() or a negative errno.
 */
static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				   const struct iovec *iov,
				   unsigned long nr_segs,
				   loff_t pos)
{
	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;

	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
		   (unsigned int)nr_segs,
		   filp->f_path.dentry->d_name.len,
		   filp->f_path.dentry->d_name.name);

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
	 * need locks to protect pending reads from racing with truncate.
	 */
	if (filp->f_flags & O_DIRECT) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;

		ret = ocfs2_rw_lock(inode, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
		ocfs2_iocb_set_rw_locked(iocb);
	}

	/*
	 * We're fine letting folks race truncates and extending
	 * writes with read across the cluster, just like they can
	 * locally. Hence no rw_lock during read.
	 *
	 * Take and drop the meta data lock to update inode fields
	 * like i_size. This allows the checks down below
	 * generic_file_aio_read() a chance of actually working.
	 */
	ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_meta_unlock(inode, lock_level);

	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
	if (ret == -EINVAL)
		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/* see ocfs2_file_aio_write */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

bail:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mlog_exit(ret);

	return ret;
}

/* Inode operations for regular files. */
struct inode_operations ocfs2_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};

/* Inode operations for special files (devices, fifos, sockets). */
struct inode_operations ocfs2_special_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};

/* File operations for regular files; sync read/write funnel into the
 * aio entry points above via do_sync_read()/do_sync_write(). */
const struct file_operations ocfs2_fops = {
	.read		= do_sync_read,
	.write		= do_sync_write,
	.sendfile	= generic_file_sendfile,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.ioctl		= ocfs2_ioctl,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};

/* File operations for directories. */
const struct file_operations ocfs2_dops = {
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.ioctl		= ocfs2_ioctl,
};