1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * file.c 5 * 6 * File open, close, extend, truncate 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/capability.h> 27 #include <linux/fs.h> 28 #include <linux/types.h> 29 #include <linux/slab.h> 30 #include <linux/highmem.h> 31 #include <linux/pagemap.h> 32 #include <linux/uio.h> 33 #include <linux/sched.h> 34 #include <linux/splice.h> 35 #include <linux/mount.h> 36 #include <linux/writeback.h> 37 #include <linux/falloc.h> 38 #include <linux/quotaops.h> 39 40 #define MLOG_MASK_PREFIX ML_INODE 41 #include <cluster/masklog.h> 42 43 #include "ocfs2.h" 44 45 #include "alloc.h" 46 #include "aops.h" 47 #include "dir.h" 48 #include "dlmglue.h" 49 #include "extent_map.h" 50 #include "file.h" 51 #include "sysfile.h" 52 #include "inode.h" 53 #include "ioctl.h" 54 #include "journal.h" 55 #include "locks.h" 56 #include "mmap.h" 57 #include "suballoc.h" 58 #include "super.h" 59 #include "xattr.h" 60 #include "acl.h" 61 #include "quota.h" 62 #include "refcounttree.h" 63 64 #include "buffer_head_io.h" 65 66 static int ocfs2_sync_inode(struct inode *inode) 67 { 68 filemap_fdatawrite(inode->i_mapping); 69 return sync_mapping_buffers(inode->i_mapping); 70 } 71 72 static int ocfs2_init_file_private(struct inode *inode, struct file *file) 73 { 74 struct ocfs2_file_private *fp; 75 76 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); 77 if (!fp) 78 return -ENOMEM; 79 80 fp->fp_file = file; 81 mutex_init(&fp->fp_mutex); 82 ocfs2_file_lock_res_init(&fp->fp_flock, fp); 83 file->private_data = fp; 84 85 return 0; 86 } 87 88 static void ocfs2_free_file_private(struct inode *inode, struct file *file) 89 { 90 struct ocfs2_file_private *fp = file->private_data; 91 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 92 93 if (fp) { 94 ocfs2_simple_drop_lockres(osb, &fp->fp_flock); 95 ocfs2_lock_res_free(&fp->fp_flock); 96 kfree(fp); 97 file->private_data = NULL; 98 } 99 } 100 101 static int ocfs2_file_open(struct inode *inode, struct file *file) 102 { 103 int status; 104 int mode = file->f_flags; 105 struct ocfs2_inode_info *oi = OCFS2_I(inode); 106 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 109 110 if (file->f_mode & FMODE_WRITE) 111 dquot_initialize(inode); 112 113 spin_lock(&oi->ip_lock); 114 115 /* Check that the inode hasn't been wiped from disk by another 116 * node. If it hasn't then we're safe as long as we hold the 117 * spin lock until our increment of open count. */ 118 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 119 spin_unlock(&oi->ip_lock); 120 121 status = -ENOENT; 122 goto leave; 123 } 124 125 if (mode & O_DIRECT) 126 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 127 128 oi->ip_open_count++; 129 spin_unlock(&oi->ip_lock); 130 131 status = ocfs2_init_file_private(inode, file); 132 if (status) { 133 /* 134 * We want to set open count back if we're failing the 135 * open. 136 */ 137 spin_lock(&oi->ip_lock); 138 oi->ip_open_count--; 139 spin_unlock(&oi->ip_lock); 140 } 141 142 leave: 143 mlog_exit(status); 144 return status; 145 } 146 147 static int ocfs2_file_release(struct inode *inode, struct file *file) 148 { 149 struct ocfs2_inode_info *oi = OCFS2_I(inode); 150 151 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 152 file->f_path.dentry->d_name.len, 153 file->f_path.dentry->d_name.name); 154 155 spin_lock(&oi->ip_lock); 156 if (!--oi->ip_open_count) 157 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 158 spin_unlock(&oi->ip_lock); 159 160 ocfs2_free_file_private(inode, file); 161 162 mlog_exit(0); 163 164 return 0; 165 } 166 167 static int ocfs2_dir_open(struct inode *inode, struct file *file) 168 { 169 return ocfs2_init_file_private(inode, file); 170 } 171 172 static int ocfs2_dir_release(struct inode *inode, struct file *file) 173 { 174 ocfs2_free_file_private(inode, file); 175 return 0; 176 } 177 178 static int ocfs2_sync_file(struct file *file, int datasync) 179 { 180 int err = 0; 181 journal_t *journal; 182 struct dentry *dentry = file->f_path.dentry; 183 struct inode *inode = file->f_mapping->host; 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 185 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 187 dentry->d_name.len, dentry->d_name.name); 188 189 err = ocfs2_sync_inode(dentry->d_inode); 190 if (err) 191 goto bail; 192 193 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 194 goto bail; 195 196 journal = osb->journal->j_journal; 197 err = jbd2_journal_force_commit(journal); 198 199 bail: 200 mlog_exit(err); 201 202 return (err < 0) ? -EIO : 0; 203 } 204 205 int ocfs2_should_update_atime(struct inode *inode, 206 struct vfsmount *vfsmnt) 207 { 208 struct timespec now; 209 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 210 211 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 212 return 0; 213 214 if ((inode->i_flags & S_NOATIME) || 215 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 216 return 0; 217 218 /* 219 * We can be called with no vfsmnt structure - NFSD will 220 * sometimes do this. 221 * 222 * Note that our action here is different than touch_atime() - 223 * if we can't tell whether this is a noatime mount, then we 224 * don't know whether to trust the value of s_atime_quantum. 225 */ 226 if (vfsmnt == NULL) 227 return 0; 228 229 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 230 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 231 return 0; 232 233 if (vfsmnt->mnt_flags & MNT_RELATIME) { 234 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 235 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 236 return 1; 237 238 return 0; 239 } 240 241 now = CURRENT_TIME; 242 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 243 return 0; 244 else 245 return 1; 246 } 247 248 int ocfs2_update_inode_atime(struct inode *inode, 249 struct buffer_head *bh) 250 { 251 int ret; 252 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 253 handle_t *handle; 254 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 255 256 mlog_entry_void(); 257 258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 259 if (IS_ERR(handle)) { 260 ret = PTR_ERR(handle); 261 mlog_errno(ret); 262 goto out; 263 } 264 265 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 266 OCFS2_JOURNAL_ACCESS_WRITE); 267 if (ret) { 268 mlog_errno(ret); 269 goto out_commit; 270 } 271 272 /* 273 * Don't use ocfs2_mark_inode_dirty() here as we don't always 274 * have i_mutex to guard against concurrent changes to other 275 * inode fields. 276 */ 277 inode->i_atime = CURRENT_TIME; 278 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 279 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 280 ocfs2_journal_dirty(handle, bh); 281 282 out_commit: 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 284 out: 285 mlog_exit(ret); 286 return ret; 287 } 288 289 static int ocfs2_set_inode_size(handle_t *handle, 290 struct inode *inode, 291 struct buffer_head *fe_bh, 292 u64 new_i_size) 293 { 294 int status; 295 296 mlog_entry_void(); 297 i_size_write(inode, new_i_size); 298 inode->i_blocks = ocfs2_inode_sector_count(inode); 299 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 300 301 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 302 if (status < 0) { 303 mlog_errno(status); 304 goto bail; 305 } 306 307 bail: 308 mlog_exit(status); 309 return status; 310 } 311 312 int ocfs2_simple_size_update(struct inode *inode, 313 struct buffer_head *di_bh, 314 u64 new_i_size) 315 { 316 int ret; 317 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 318 handle_t *handle = NULL; 319 320 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 321 if (IS_ERR(handle)) { 322 ret = PTR_ERR(handle); 323 mlog_errno(ret); 324 goto out; 325 } 326 327 ret = ocfs2_set_inode_size(handle, inode, di_bh, 328 new_i_size); 329 if (ret < 0) 330 mlog_errno(ret); 331 332 ocfs2_commit_trans(osb, handle); 333 out: 334 return ret; 335 } 336 337 static int ocfs2_cow_file_pos(struct inode *inode, 338 struct buffer_head *fe_bh, 339 u64 offset) 340 { 341 int status; 342 u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 343 unsigned int num_clusters = 0; 344 unsigned int ext_flags = 0; 345 346 /* 347 * If the new offset is aligned to the range of the cluster, there is 348 * no space for ocfs2_zero_range_for_truncate to fill, so no need to 349 * CoW either. 350 */ 351 if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) 352 return 0; 353 354 status = ocfs2_get_clusters(inode, cpos, &phys, 355 &num_clusters, &ext_flags); 356 if (status) { 357 mlog_errno(status); 358 goto out; 359 } 360 361 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 362 goto out; 363 364 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); 365 366 out: 367 return status; 368 } 369 370 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 371 struct inode *inode, 372 struct buffer_head *fe_bh, 373 u64 new_i_size) 374 { 375 int status; 376 handle_t *handle; 377 struct ocfs2_dinode *di; 378 u64 cluster_bytes; 379 380 mlog_entry_void(); 381 382 /* 383 * We need to CoW the cluster contains the offset if it is reflinked 384 * since we will call ocfs2_zero_range_for_truncate later which will 385 * write "0" from offset to the end of the cluster. 386 */ 387 status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); 388 if (status) { 389 mlog_errno(status); 390 return status; 391 } 392 393 /* TODO: This needs to actually orphan the inode in this 394 * transaction. */ 395 396 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 397 if (IS_ERR(handle)) { 398 status = PTR_ERR(handle); 399 mlog_errno(status); 400 goto out; 401 } 402 403 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 404 OCFS2_JOURNAL_ACCESS_WRITE); 405 if (status < 0) { 406 mlog_errno(status); 407 goto out_commit; 408 } 409 410 /* 411 * Do this before setting i_size. 412 */ 413 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 414 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, 415 cluster_bytes); 416 if (status) { 417 mlog_errno(status); 418 goto out_commit; 419 } 420 421 i_size_write(inode, new_i_size); 422 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 423 424 di = (struct ocfs2_dinode *) fe_bh->b_data; 425 di->i_size = cpu_to_le64(new_i_size); 426 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 427 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 428 429 ocfs2_journal_dirty(handle, fe_bh); 430 431 out_commit: 432 ocfs2_commit_trans(osb, handle); 433 out: 434 435 mlog_exit(status); 436 return status; 437 } 438 439 static int ocfs2_truncate_file(struct inode *inode, 440 struct buffer_head *di_bh, 441 u64 new_i_size) 442 { 443 int status = 0; 444 struct ocfs2_dinode *fe = NULL; 445 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 446 447 mlog_entry("(inode = %llu, new_i_size = %llu\n", 448 (unsigned long long)OCFS2_I(inode)->ip_blkno, 449 (unsigned long long)new_i_size); 450 451 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 452 * already validated it */ 453 fe = (struct ocfs2_dinode *) di_bh->b_data; 454 455 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 456 "Inode %llu, inode i_size = %lld != di " 457 "i_size = %llu, i_flags = 0x%x\n", 458 (unsigned long long)OCFS2_I(inode)->ip_blkno, 459 i_size_read(inode), 460 (unsigned long long)le64_to_cpu(fe->i_size), 461 le32_to_cpu(fe->i_flags)); 462 463 if (new_i_size > le64_to_cpu(fe->i_size)) { 464 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 465 (unsigned long long)le64_to_cpu(fe->i_size), 466 (unsigned long long)new_i_size); 467 status = -EINVAL; 468 mlog_errno(status); 469 goto bail; 470 } 471 472 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 473 (unsigned long long)le64_to_cpu(fe->i_blkno), 474 (unsigned long long)le64_to_cpu(fe->i_size), 475 (unsigned long long)new_i_size); 476 477 /* lets handle the simple truncate cases before doing any more 478 * cluster locking. */ 479 if (new_i_size == le64_to_cpu(fe->i_size)) 480 goto bail; 481 482 down_write(&OCFS2_I(inode)->ip_alloc_sem); 483 484 ocfs2_resv_discard(&osb->osb_la_resmap, 485 &OCFS2_I(inode)->ip_la_data_resv); 486 487 /* 488 * The inode lock forced other nodes to sync and drop their 489 * pages, which (correctly) happens even if we have a truncate 490 * without allocation change - ocfs2 cluster sizes can be much 491 * greater than page size, so we have to truncate them 492 * anyway. 493 */ 494 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 495 truncate_inode_pages(inode->i_mapping, new_i_size); 496 497 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 498 status = ocfs2_truncate_inline(inode, di_bh, new_i_size, 499 i_size_read(inode), 1); 500 if (status) 501 mlog_errno(status); 502 503 goto bail_unlock_sem; 504 } 505 506 /* alright, we're going to need to do a full blown alloc size 507 * change. Orphan the inode so that recovery can complete the 508 * truncate if necessary. This does the task of marking 509 * i_size. */ 510 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 511 if (status < 0) { 512 mlog_errno(status); 513 goto bail_unlock_sem; 514 } 515 516 status = ocfs2_commit_truncate(osb, inode, di_bh); 517 if (status < 0) { 518 mlog_errno(status); 519 goto bail_unlock_sem; 520 } 521 522 /* TODO: orphan dir cleanup here. */ 523 bail_unlock_sem: 524 up_write(&OCFS2_I(inode)->ip_alloc_sem); 525 526 bail: 527 if (!status && OCFS2_I(inode)->ip_clusters == 0) 528 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 529 530 mlog_exit(status); 531 return status; 532 } 533 534 /* 535 * extend file allocation only here. 536 * we'll update all the disk stuff, and oip->alloc_size 537 * 538 * expect stuff to be locked, a transaction started and enough data / 539 * metadata reservations in the contexts. 540 * 541 * Will return -EAGAIN, and a reason if a restart is needed. 542 * If passed in, *reason will always be set, even in error. 543 */ 544 int ocfs2_add_inode_data(struct ocfs2_super *osb, 545 struct inode *inode, 546 u32 *logical_offset, 547 u32 clusters_to_add, 548 int mark_unwritten, 549 struct buffer_head *fe_bh, 550 handle_t *handle, 551 struct ocfs2_alloc_context *data_ac, 552 struct ocfs2_alloc_context *meta_ac, 553 enum ocfs2_alloc_restarted *reason_ret) 554 { 555 int ret; 556 struct ocfs2_extent_tree et; 557 558 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); 559 ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, 560 clusters_to_add, mark_unwritten, 561 data_ac, meta_ac, reason_ret); 562 563 return ret; 564 } 565 566 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, 567 u32 clusters_to_add, int mark_unwritten) 568 { 569 int status = 0; 570 int restart_func = 0; 571 int credits; 572 u32 prev_clusters; 573 struct buffer_head *bh = NULL; 574 struct ocfs2_dinode *fe = NULL; 575 handle_t *handle = NULL; 576 struct ocfs2_alloc_context *data_ac = NULL; 577 struct ocfs2_alloc_context *meta_ac = NULL; 578 enum ocfs2_alloc_restarted why; 579 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 580 struct ocfs2_extent_tree et; 581 int did_quota = 0; 582 583 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 584 585 /* 586 * This function only exists for file systems which don't 587 * support holes. 588 */ 589 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 590 591 status = ocfs2_read_inode_block(inode, &bh); 592 if (status < 0) { 593 mlog_errno(status); 594 goto leave; 595 } 596 fe = (struct ocfs2_dinode *) bh->b_data; 597 598 restart_all: 599 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 600 601 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 602 "clusters_to_add = %u\n", 603 (unsigned long long)OCFS2_I(inode)->ip_blkno, 604 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), 605 clusters_to_add); 606 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 607 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 608 &data_ac, &meta_ac); 609 if (status) { 610 mlog_errno(status); 611 goto leave; 612 } 613 614 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, 615 clusters_to_add); 616 handle = ocfs2_start_trans(osb, credits); 617 if (IS_ERR(handle)) { 618 status = PTR_ERR(handle); 619 handle = NULL; 620 mlog_errno(status); 621 goto leave; 622 } 623 624 restarted_transaction: 625 status = dquot_alloc_space_nodirty(inode, 626 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 627 if (status) 628 goto leave; 629 did_quota = 1; 630 631 /* reserve a write to the file entry early on - that we if we 632 * run out of credits in the allocation path, we can still 633 * update i_size. */ 634 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 635 OCFS2_JOURNAL_ACCESS_WRITE); 636 if (status < 0) { 637 mlog_errno(status); 638 goto leave; 639 } 640 641 prev_clusters = OCFS2_I(inode)->ip_clusters; 642 643 status = ocfs2_add_inode_data(osb, 644 inode, 645 &logical_start, 646 clusters_to_add, 647 mark_unwritten, 648 bh, 649 handle, 650 data_ac, 651 meta_ac, 652 &why); 653 if ((status < 0) && (status != -EAGAIN)) { 654 if (status != -ENOSPC) 655 mlog_errno(status); 656 goto leave; 657 } 658 659 ocfs2_journal_dirty(handle, bh); 660 661 spin_lock(&OCFS2_I(inode)->ip_lock); 662 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 663 spin_unlock(&OCFS2_I(inode)->ip_lock); 664 /* Release unused quota reservation */ 665 dquot_free_space(inode, 666 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 667 did_quota = 0; 668 669 if (why != RESTART_NONE && clusters_to_add) { 670 if (why == RESTART_META) { 671 mlog(0, "restarting function.\n"); 672 restart_func = 1; 673 status = 0; 674 } else { 675 BUG_ON(why != RESTART_TRANS); 676 677 mlog(0, "restarting transaction.\n"); 678 /* TODO: This can be more intelligent. */ 679 credits = ocfs2_calc_extend_credits(osb->sb, 680 &fe->id2.i_list, 681 clusters_to_add); 682 status = ocfs2_extend_trans(handle, credits); 683 if (status < 0) { 684 /* handle still has to be committed at 685 * this point. */ 686 status = -ENOMEM; 687 mlog_errno(status); 688 goto leave; 689 } 690 goto restarted_transaction; 691 } 692 } 693 694 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 695 le32_to_cpu(fe->i_clusters), 696 (unsigned long long)le64_to_cpu(fe->i_size)); 697 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 698 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 699 700 leave: 701 if (status < 0 && did_quota) 702 dquot_free_space(inode, 703 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 704 if (handle) { 705 ocfs2_commit_trans(osb, handle); 706 handle = NULL; 707 } 708 if (data_ac) { 709 ocfs2_free_alloc_context(data_ac); 710 data_ac = NULL; 711 } 712 if (meta_ac) { 713 ocfs2_free_alloc_context(meta_ac); 714 meta_ac = NULL; 715 } 716 if ((!status) && restart_func) { 717 restart_func = 0; 718 goto restart_all; 719 } 720 brelse(bh); 721 bh = NULL; 722 723 mlog_exit(status); 724 return status; 725 } 726 727 /* Some parts of this taken from generic_cont_expand, which turned out 728 * to be too fragile to do exactly what we need without us having to 729 * worry about recursive locking in ->write_begin() and ->write_end(). */ 730 static int ocfs2_write_zero_page(struct inode *inode, 731 u64 size) 732 { 733 struct address_space *mapping = inode->i_mapping; 734 struct page *page; 735 unsigned long index; 736 unsigned int offset; 737 handle_t *handle = NULL; 738 int ret; 739 740 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 741 /* ugh. in prepare/commit_write, if from==to==start of block, we 742 ** skip the prepare. make sure we never send an offset for the start 743 ** of a block 744 */ 745 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 746 offset++; 747 } 748 index = size >> PAGE_CACHE_SHIFT; 749 750 page = grab_cache_page(mapping, index); 751 if (!page) { 752 ret = -ENOMEM; 753 mlog_errno(ret); 754 goto out; 755 } 756 757 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 758 if (ret < 0) { 759 mlog_errno(ret); 760 goto out_unlock; 761 } 762 763 if (ocfs2_should_order_data(inode)) { 764 handle = ocfs2_start_walk_page_trans(inode, page, offset, 765 offset); 766 if (IS_ERR(handle)) { 767 ret = PTR_ERR(handle); 768 handle = NULL; 769 goto out_unlock; 770 } 771 } 772 773 /* must not update i_size! */ 774 ret = block_commit_write(page, offset, offset); 775 if (ret < 0) 776 mlog_errno(ret); 777 else 778 ret = 0; 779 780 if (handle) 781 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 782 out_unlock: 783 unlock_page(page); 784 page_cache_release(page); 785 out: 786 return ret; 787 } 788 789 static int ocfs2_zero_extend(struct inode *inode, 790 u64 zero_to_size) 791 { 792 int ret = 0; 793 u64 start_off; 794 struct super_block *sb = inode->i_sb; 795 796 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 797 while (start_off < zero_to_size) { 798 ret = ocfs2_write_zero_page(inode, start_off); 799 if (ret < 0) { 800 mlog_errno(ret); 801 goto out; 802 } 803 804 start_off += sb->s_blocksize; 805 806 /* 807 * Very large extends have the potential to lock up 808 * the cpu for extended periods of time. 809 */ 810 cond_resched(); 811 } 812 813 out: 814 return ret; 815 } 816 817 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 818 { 819 int ret; 820 u32 clusters_to_add; 821 struct ocfs2_inode_info *oi = OCFS2_I(inode); 822 823 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 824 if (clusters_to_add < oi->ip_clusters) 825 clusters_to_add = 0; 826 else 827 clusters_to_add -= oi->ip_clusters; 828 829 if (clusters_to_add) { 830 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, 831 clusters_to_add, 0); 832 if (ret) { 833 mlog_errno(ret); 834 goto out; 835 } 836 } 837 838 /* 839 * Call this even if we don't add any clusters to the tree. We 840 * still need to zero the area between the old i_size and the 841 * new i_size. 842 */ 843 ret = ocfs2_zero_extend(inode, zero_to); 844 if (ret < 0) 845 mlog_errno(ret); 846 847 out: 848 return ret; 849 } 850 851 static int ocfs2_extend_file(struct inode *inode, 852 struct buffer_head *di_bh, 853 u64 new_i_size) 854 { 855 int ret = 0; 856 struct ocfs2_inode_info *oi = OCFS2_I(inode); 857 858 BUG_ON(!di_bh); 859 860 /* setattr sometimes calls us like this. */ 861 if (new_i_size == 0) 862 goto out; 863 864 if (i_size_read(inode) == new_i_size) 865 goto out; 866 BUG_ON(new_i_size < i_size_read(inode)); 867 868 /* 869 * Fall through for converting inline data, even if the fs 870 * supports sparse files. 871 * 872 * The check for inline data here is legal - nobody can add 873 * the feature since we have i_mutex. We must check it again 874 * after acquiring ip_alloc_sem though, as paths like mmap 875 * might have raced us to converting the inode to extents. 876 */ 877 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 878 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 879 goto out_update_size; 880 881 /* 882 * The alloc sem blocks people in read/write from reading our 883 * allocation until we're done changing it. We depend on 884 * i_mutex to block other extend/truncate calls while we're 885 * here. 886 */ 887 down_write(&oi->ip_alloc_sem); 888 889 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 890 /* 891 * We can optimize small extends by keeping the inodes 892 * inline data. 893 */ 894 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) { 895 up_write(&oi->ip_alloc_sem); 896 goto out_update_size; 897 } 898 899 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 900 if (ret) { 901 up_write(&oi->ip_alloc_sem); 902 903 mlog_errno(ret); 904 goto out; 905 } 906 } 907 908 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 909 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 910 911 up_write(&oi->ip_alloc_sem); 912 913 if (ret < 0) { 914 mlog_errno(ret); 915 goto out; 916 } 917 918 out_update_size: 919 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 920 if (ret < 0) 921 mlog_errno(ret); 922 923 out: 924 return ret; 925 } 926 927 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 928 { 929 int status = 0, size_change; 930 struct inode *inode = dentry->d_inode; 931 struct super_block *sb = inode->i_sb; 932 struct ocfs2_super *osb = OCFS2_SB(sb); 933 struct buffer_head *bh = NULL; 934 handle_t *handle = NULL; 935 struct dquot *transfer_to[MAXQUOTAS] = { }; 936 int qtype; 937 938 mlog_entry("(0x%p, '%.*s')\n", dentry, 939 dentry->d_name.len, dentry->d_name.name); 940 941 /* ensuring we don't even attempt to truncate a symlink */ 942 if (S_ISLNK(inode->i_mode)) 943 attr->ia_valid &= ~ATTR_SIZE; 944 945 if (attr->ia_valid & ATTR_MODE) 946 mlog(0, "mode change: %d\n", attr->ia_mode); 947 if (attr->ia_valid & ATTR_UID) 948 mlog(0, "uid change: %d\n", attr->ia_uid); 949 if (attr->ia_valid & ATTR_GID) 950 mlog(0, "gid change: %d\n", attr->ia_gid); 951 if (attr->ia_valid & ATTR_SIZE) 952 mlog(0, "size change...\n"); 953 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 954 mlog(0, "time change...\n"); 955 956 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 957 | ATTR_GID | ATTR_UID | ATTR_MODE) 958 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 959 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 960 return 0; 961 } 962 963 status = inode_change_ok(inode, attr); 964 if (status) 965 return status; 966 967 if (is_quota_modification(inode, attr)) 968 dquot_initialize(inode); 969 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 970 if (size_change) { 971 status = ocfs2_rw_lock(inode, 1); 972 if (status < 0) { 973 mlog_errno(status); 974 goto bail; 975 } 976 } 977 978 status = ocfs2_inode_lock(inode, &bh, 1); 979 if (status < 0) { 980 if (status != -ENOENT) 981 mlog_errno(status); 982 goto bail_unlock_rw; 983 } 984 985 if (size_change && attr->ia_size != i_size_read(inode)) { 986 status = inode_newsize_ok(inode, attr->ia_size); 987 if (status) 988 goto bail_unlock; 989 990 if (i_size_read(inode) > attr->ia_size) { 991 if (ocfs2_should_order_data(inode)) { 992 status = ocfs2_begin_ordered_truncate(inode, 993 attr->ia_size); 994 if (status) 995 goto bail_unlock; 996 } 997 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 998 } else 999 status = ocfs2_extend_file(inode, bh, attr->ia_size); 1000 if (status < 0) { 1001 if (status != -ENOSPC) 1002 mlog_errno(status); 1003 status = -ENOSPC; 1004 goto bail_unlock; 1005 } 1006 } 1007 1008 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 1009 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 1010 /* 1011 * Gather pointers to quota structures so that allocation / 1012 * freeing of quota structures happens here and not inside 1013 * dquot_transfer() where we have problems with lock ordering 1014 */ 1015 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1016 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1017 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1018 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1019 USRQUOTA); 1020 if (!transfer_to[USRQUOTA]) { 1021 status = -ESRCH; 1022 goto bail_unlock; 1023 } 1024 } 1025 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid 1026 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1027 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1028 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1029 GRPQUOTA); 1030 if (!transfer_to[GRPQUOTA]) { 1031 status = -ESRCH; 1032 goto bail_unlock; 1033 } 1034 } 1035 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 1036 2 * ocfs2_quota_trans_credits(sb)); 1037 if (IS_ERR(handle)) { 1038 status = PTR_ERR(handle); 1039 mlog_errno(status); 1040 goto bail_unlock; 1041 } 1042 status = __dquot_transfer(inode, transfer_to); 1043 if (status < 0) 1044 goto bail_commit; 1045 } else { 1046 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1047 if (IS_ERR(handle)) { 1048 status = PTR_ERR(handle); 1049 mlog_errno(status); 1050 goto bail_unlock; 1051 } 1052 } 1053 1054 /* 1055 * This will intentionally not wind up calling simple_setsize(), 1056 * since all the work for a size change has been done above. 1057 * Otherwise, we could get into problems with truncate as 1058 * ip_alloc_sem is used there to protect against i_size 1059 * changes. 1060 */ 1061 status = inode_setattr(inode, attr); 1062 if (status < 0) { 1063 mlog_errno(status); 1064 goto bail_commit; 1065 } 1066 1067 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1068 if (status < 0) 1069 mlog_errno(status); 1070 1071 bail_commit: 1072 ocfs2_commit_trans(osb, handle); 1073 bail_unlock: 1074 ocfs2_inode_unlock(inode, 1); 1075 bail_unlock_rw: 1076 if (size_change) 1077 ocfs2_rw_unlock(inode, 1); 1078 bail: 1079 brelse(bh); 1080 1081 /* Release quota pointers in case we acquired them */ 1082 for (qtype = 0; qtype < MAXQUOTAS; qtype++) 1083 dqput(transfer_to[qtype]); 1084 1085 if (!status && attr->ia_valid & ATTR_MODE) { 1086 status = ocfs2_acl_chmod(inode); 1087 if (status < 0) 1088 mlog_errno(status); 1089 } 1090 1091 mlog_exit(status); 1092 return status; 1093 } 1094 1095 int ocfs2_getattr(struct vfsmount *mnt, 1096 struct dentry *dentry, 1097 struct kstat *stat) 1098 { 1099 struct inode *inode = dentry->d_inode; 1100 struct super_block *sb = dentry->d_inode->i_sb; 1101 struct ocfs2_super *osb = sb->s_fs_info; 1102 int err; 1103 1104 mlog_entry_void(); 1105 1106 err = ocfs2_inode_revalidate(dentry); 1107 if (err) { 1108 if (err != -ENOENT) 1109 mlog_errno(err); 1110 goto bail; 1111 } 1112 1113 generic_fillattr(inode, stat); 1114 1115 /* We set the blksize from the cluster size for performance */ 1116 stat->blksize = osb->s_clustersize; 1117 1118 bail: 1119 mlog_exit(err); 1120 1121 return err; 1122 } 1123 1124 int ocfs2_permission(struct inode *inode, int mask) 1125 { 1126 int ret; 1127 1128 mlog_entry_void(); 1129 1130 ret = ocfs2_inode_lock(inode, NULL, 0); 1131 if (ret) { 1132 if (ret != -ENOENT) 1133 mlog_errno(ret); 1134 goto out; 1135 } 1136 1137 ret = generic_permission(inode, mask, ocfs2_check_acl); 1138 1139 ocfs2_inode_unlock(inode, 0); 1140 out: 1141 mlog_exit(ret); 1142 return ret; 1143 } 1144 1145 static int __ocfs2_write_remove_suid(struct inode *inode, 1146 struct buffer_head *bh) 1147 { 1148 int ret; 1149 handle_t *handle; 1150 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1151 struct ocfs2_dinode *di; 1152 1153 mlog_entry("(Inode %llu, mode 0%o)\n", 1154 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); 1155 1156 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1157 if (IS_ERR(handle)) { 1158 ret = PTR_ERR(handle); 1159 mlog_errno(ret); 1160 goto out; 1161 } 1162 1163 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1164 OCFS2_JOURNAL_ACCESS_WRITE); 1165 if (ret < 0) { 1166 mlog_errno(ret); 1167 goto out_trans; 1168 } 1169 1170 inode->i_mode &= ~S_ISUID; 1171 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1172 inode->i_mode &= ~S_ISGID; 1173 1174 di = (struct ocfs2_dinode *) bh->b_data; 1175 di->i_mode = cpu_to_le16(inode->i_mode); 1176 1177 ocfs2_journal_dirty(handle, bh); 1178 1179 out_trans: 1180 ocfs2_commit_trans(osb, handle); 1181 out: 1182 mlog_exit(ret); 1183 return ret; 1184 } 1185 1186 /* 1187 * Will look for holes and unwritten extents in the range starting at 1188 * pos for count bytes (inclusive). 1189 */ 1190 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1191 size_t count) 1192 { 1193 int ret = 0; 1194 unsigned int extent_flags; 1195 u32 cpos, clusters, extent_len, phys_cpos; 1196 struct super_block *sb = inode->i_sb; 1197 1198 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1199 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1200 1201 while (clusters) { 1202 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1203 &extent_flags); 1204 if (ret < 0) { 1205 mlog_errno(ret); 1206 goto out; 1207 } 1208 1209 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1210 ret = 1; 1211 break; 1212 } 1213 1214 if (extent_len > clusters) 1215 extent_len = clusters; 1216 1217 clusters -= extent_len; 1218 cpos += extent_len; 1219 } 1220 out: 1221 return ret; 1222 } 1223 1224 static int ocfs2_write_remove_suid(struct inode *inode) 1225 { 1226 int ret; 1227 struct buffer_head *bh = NULL; 1228 1229 ret = ocfs2_read_inode_block(inode, &bh); 1230 if (ret < 0) { 1231 mlog_errno(ret); 1232 goto out; 1233 } 1234 1235 ret = __ocfs2_write_remove_suid(inode, bh); 1236 out: 1237 brelse(bh); 1238 return ret; 1239 } 1240 1241 /* 1242 * Allocate enough extents to cover the region starting at byte offset 1243 * start for len bytes. Existing extents are skipped, any extents 1244 * added are marked as "unwritten". 1245 */ 1246 static int ocfs2_allocate_unwritten_extents(struct inode *inode, 1247 u64 start, u64 len) 1248 { 1249 int ret; 1250 u32 cpos, phys_cpos, clusters, alloc_size; 1251 u64 end = start + len; 1252 struct buffer_head *di_bh = NULL; 1253 1254 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1255 ret = ocfs2_read_inode_block(inode, &di_bh); 1256 if (ret) { 1257 mlog_errno(ret); 1258 goto out; 1259 } 1260 1261 /* 1262 * Nothing to do if the requested reservation range 1263 * fits within the inode. 1264 */ 1265 if (ocfs2_size_fits_inline_data(di_bh, end)) 1266 goto out; 1267 1268 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1269 if (ret) { 1270 mlog_errno(ret); 1271 goto out; 1272 } 1273 } 1274 1275 /* 1276 * We consider both start and len to be inclusive. 1277 */ 1278 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1279 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); 1280 clusters -= cpos; 1281 1282 while (clusters) { 1283 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1284 &alloc_size, NULL); 1285 if (ret) { 1286 mlog_errno(ret); 1287 goto out; 1288 } 1289 1290 /* 1291 * Hole or existing extent len can be arbitrary, so 1292 * cap it to our own allocation request. 1293 */ 1294 if (alloc_size > clusters) 1295 alloc_size = clusters; 1296 1297 if (phys_cpos) { 1298 /* 1299 * We already have an allocation at this 1300 * region so we can safely skip it. 1301 */ 1302 goto next; 1303 } 1304 1305 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); 1306 if (ret) { 1307 if (ret != -ENOSPC) 1308 mlog_errno(ret); 1309 goto out; 1310 } 1311 1312 next: 1313 cpos += alloc_size; 1314 clusters -= alloc_size; 1315 } 1316 1317 ret = 0; 1318 out: 1319 1320 brelse(di_bh); 1321 return ret; 1322 } 1323 1324 /* 1325 * Truncate a byte range, avoiding pages within partial clusters. This 1326 * preserves those pages for the zeroing code to write to. 1327 */ 1328 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, 1329 u64 byte_len) 1330 { 1331 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1332 loff_t start, end; 1333 struct address_space *mapping = inode->i_mapping; 1334 1335 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); 1336 end = byte_start + byte_len; 1337 end = end & ~(osb->s_clustersize - 1); 1338 1339 if (start < end) { 1340 unmap_mapping_range(mapping, start, end - start, 0); 1341 truncate_inode_pages_range(mapping, start, end - 1); 1342 } 1343 } 1344 1345 static int ocfs2_zero_partial_clusters(struct inode *inode, 1346 u64 start, u64 len) 1347 { 1348 int ret = 0; 1349 u64 tmpend, end = start + len; 1350 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1351 unsigned int csize = osb->s_clustersize; 1352 handle_t *handle; 1353 1354 /* 1355 * The "start" and "end" values are NOT necessarily part of 1356 * the range whose allocation is being deleted. Rather, this 1357 * is what the user passed in with the request. We must zero 1358 * partial clusters here. There's no need to worry about 1359 * physical allocation - the zeroing code knows to skip holes. 1360 */ 1361 mlog(0, "byte start: %llu, end: %llu\n", 1362 (unsigned long long)start, (unsigned long long)end); 1363 1364 /* 1365 * If both edges are on a cluster boundary then there's no 1366 * zeroing required as the region is part of the allocation to 1367 * be truncated. 1368 */ 1369 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) 1370 goto out; 1371 1372 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1373 if (IS_ERR(handle)) { 1374 ret = PTR_ERR(handle); 1375 mlog_errno(ret); 1376 goto out; 1377 } 1378 1379 /* 1380 * We want to get the byte offset of the end of the 1st cluster. 1381 */ 1382 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1383 if (tmpend > end) 1384 tmpend = end; 1385 1386 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1387 (unsigned long long)start, (unsigned long long)tmpend); 1388 1389 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1390 if (ret) 1391 mlog_errno(ret); 1392 1393 if (tmpend < end) { 1394 /* 1395 * This may make start and end equal, but the zeroing 1396 * code will skip any work in that case so there's no 1397 * need to catch it up here. 1398 */ 1399 start = end & ~(osb->s_clustersize - 1); 1400 1401 mlog(0, "2nd range: start: %llu, end: %llu\n", 1402 (unsigned long long)start, (unsigned long long)end); 1403 1404 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1405 if (ret) 1406 mlog_errno(ret); 1407 } 1408 1409 ocfs2_commit_trans(osb, handle); 1410 out: 1411 return ret; 1412 } 1413 1414 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) 1415 { 1416 int i; 1417 struct ocfs2_extent_rec *rec = NULL; 1418 1419 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 1420 1421 rec = &el->l_recs[i]; 1422 1423 if (le32_to_cpu(rec->e_cpos) < pos) 1424 break; 1425 } 1426 1427 return i; 1428 } 1429 1430 /* 1431 * Helper to calculate the punching pos and length in one run, we handle the 1432 * following three cases in order: 1433 * 1434 * - remove the entire record 1435 * - remove a partial record 1436 * - no record needs to be removed (hole-punching completed) 1437 */ 1438 static void ocfs2_calc_trunc_pos(struct inode *inode, 1439 struct ocfs2_extent_list *el, 1440 struct ocfs2_extent_rec *rec, 1441 u32 trunc_start, u32 *trunc_cpos, 1442 u32 *trunc_len, u32 *trunc_end, 1443 u64 *blkno, int *done) 1444 { 1445 int ret = 0; 1446 u32 coff, range; 1447 1448 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 1449 1450 if (le32_to_cpu(rec->e_cpos) >= trunc_start) { 1451 *trunc_cpos = le32_to_cpu(rec->e_cpos); 1452 /* 1453 * Skip holes if any. 1454 */ 1455 if (range < *trunc_end) 1456 *trunc_end = range; 1457 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); 1458 *blkno = le64_to_cpu(rec->e_blkno); 1459 *trunc_end = le32_to_cpu(rec->e_cpos); 1460 } else if (range > trunc_start) { 1461 *trunc_cpos = trunc_start; 1462 *trunc_len = *trunc_end - trunc_start; 1463 coff = trunc_start - le32_to_cpu(rec->e_cpos); 1464 *blkno = le64_to_cpu(rec->e_blkno) + 1465 ocfs2_clusters_to_blocks(inode->i_sb, coff); 1466 *trunc_end = trunc_start; 1467 } else { 1468 /* 1469 * It may have two following possibilities: 1470 * 1471 * - last record has been removed 1472 * - trunc_start was within a hole 1473 * 1474 * both two cases mean the completion of hole punching. 1475 */ 1476 ret = 1; 1477 } 1478 1479 *done = ret; 1480 } 1481 1482 static int ocfs2_remove_inode_range(struct inode *inode, 1483 struct buffer_head *di_bh, u64 byte_start, 1484 u64 byte_len) 1485 { 1486 int ret = 0, flags = 0, done = 0, i; 1487 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; 1488 u32 cluster_in_el; 1489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1490 struct ocfs2_cached_dealloc_ctxt dealloc; 1491 struct address_space *mapping = inode->i_mapping; 1492 struct ocfs2_extent_tree et; 1493 struct ocfs2_path *path = NULL; 1494 struct ocfs2_extent_list *el = NULL; 1495 struct ocfs2_extent_rec *rec = NULL; 1496 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1497 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); 1498 1499 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1500 ocfs2_init_dealloc_ctxt(&dealloc); 1501 1502 if (byte_len == 0) 1503 return 0; 1504 1505 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1506 ret = ocfs2_truncate_inline(inode, di_bh, byte_start, 1507 byte_start + byte_len, 0); 1508 if (ret) { 1509 mlog_errno(ret); 1510 goto out; 1511 } 1512 /* 1513 * There's no need to get fancy with the page cache 1514 * truncate of an inline-data inode. We're talking 1515 * about less than a page here, which will be cached 1516 * in the dinode buffer anyway. 1517 */ 1518 unmap_mapping_range(mapping, 0, 0, 0); 1519 truncate_inode_pages(mapping, 0); 1520 goto out; 1521 } 1522 1523 /* 1524 * For reflinks, we may need to CoW 2 clusters which might be 1525 * partially zero'd later, if hole's start and end offset were 1526 * within one cluster(means is not exactly aligned to clustersize). 1527 */ 1528 1529 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { 1530 1531 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); 1532 if (ret) { 1533 mlog_errno(ret); 1534 goto out; 1535 } 1536 1537 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); 1538 if (ret) { 1539 mlog_errno(ret); 1540 goto out; 1541 } 1542 } 1543 1544 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1545 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1546 cluster_in_el = trunc_end; 1547 1548 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n", 1549 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1550 (unsigned long long)byte_start, 1551 (unsigned long long)byte_len, trunc_start, trunc_end); 1552 1553 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1554 if (ret) { 1555 mlog_errno(ret); 1556 goto out; 1557 } 1558 1559 path = ocfs2_new_path_from_et(&et); 1560 if (!path) { 1561 ret = -ENOMEM; 1562 mlog_errno(ret); 1563 goto out; 1564 } 1565 1566 while (trunc_end > trunc_start) { 1567 1568 ret = ocfs2_find_path(INODE_CACHE(inode), path, 1569 cluster_in_el); 1570 if (ret) { 1571 mlog_errno(ret); 1572 goto out; 1573 } 1574 1575 el = path_leaf_el(path); 1576 1577 i = ocfs2_find_rec(el, trunc_end); 1578 /* 1579 * Need to go to previous extent block. 1580 */ 1581 if (i < 0) { 1582 if (path->p_tree_depth == 0) 1583 break; 1584 1585 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 1586 path, 1587 &cluster_in_el); 1588 if (ret) { 1589 mlog_errno(ret); 1590 goto out; 1591 } 1592 1593 /* 1594 * We've reached the leftmost extent block, 1595 * it's safe to leave. 1596 */ 1597 if (cluster_in_el == 0) 1598 break; 1599 1600 /* 1601 * The 'pos' searched for previous extent block is 1602 * always one cluster less than actual trunc_end. 1603 */ 1604 trunc_end = cluster_in_el + 1; 1605 1606 ocfs2_reinit_path(path, 1); 1607 1608 continue; 1609 1610 } else 1611 rec = &el->l_recs[i]; 1612 1613 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, 1614 &trunc_len, &trunc_end, &blkno, &done); 1615 if (done) 1616 break; 1617 1618 flags = rec->e_flags; 1619 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); 1620 1621 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, 1622 phys_cpos, trunc_len, flags, 1623 &dealloc, refcount_loc); 1624 if (ret < 0) { 1625 mlog_errno(ret); 1626 goto out; 1627 } 1628 1629 cluster_in_el = trunc_end; 1630 1631 ocfs2_reinit_path(path, 1); 1632 } 1633 1634 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1635 1636 out: 1637 ocfs2_schedule_truncate_log_flush(osb, 1); 1638 ocfs2_run_deallocs(osb, &dealloc); 1639 1640 return ret; 1641 } 1642 1643 /* 1644 * Parts of this function taken from xfs_change_file_space() 1645 */ 1646 static int __ocfs2_change_file_space(struct file *file, struct inode *inode, 1647 loff_t f_pos, unsigned int cmd, 1648 struct ocfs2_space_resv *sr, 1649 int change_size) 1650 { 1651 int ret; 1652 s64 llen; 1653 loff_t size; 1654 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1655 struct buffer_head *di_bh = NULL; 1656 handle_t *handle; 1657 unsigned long long max_off = inode->i_sb->s_maxbytes; 1658 1659 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 1660 return -EROFS; 1661 1662 mutex_lock(&inode->i_mutex); 1663 1664 /* 1665 * This prevents concurrent writes on other nodes 1666 */ 1667 ret = ocfs2_rw_lock(inode, 1); 1668 if (ret) { 1669 mlog_errno(ret); 1670 goto out; 1671 } 1672 1673 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1674 if (ret) { 1675 mlog_errno(ret); 1676 goto out_rw_unlock; 1677 } 1678 1679 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1680 ret = -EPERM; 1681 goto out_inode_unlock; 1682 } 1683 1684 switch (sr->l_whence) { 1685 case 0: /*SEEK_SET*/ 1686 break; 1687 case 1: /*SEEK_CUR*/ 1688 sr->l_start += f_pos; 1689 break; 1690 case 2: /*SEEK_END*/ 1691 sr->l_start += i_size_read(inode); 1692 break; 1693 default: 1694 ret = -EINVAL; 1695 goto out_inode_unlock; 1696 } 1697 sr->l_whence = 0; 1698 1699 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; 1700 1701 if (sr->l_start < 0 1702 || sr->l_start > max_off 1703 || (sr->l_start + llen) < 0 1704 || (sr->l_start + llen) > max_off) { 1705 ret = -EINVAL; 1706 goto out_inode_unlock; 1707 } 1708 size = sr->l_start + sr->l_len; 1709 1710 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1711 if (sr->l_len <= 0) { 1712 ret = -EINVAL; 1713 goto out_inode_unlock; 1714 } 1715 } 1716 1717 if (file && should_remove_suid(file->f_path.dentry)) { 1718 ret = __ocfs2_write_remove_suid(inode, di_bh); 1719 if (ret) { 1720 mlog_errno(ret); 1721 goto out_inode_unlock; 1722 } 1723 } 1724 1725 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1726 switch (cmd) { 1727 case OCFS2_IOC_RESVSP: 1728 case OCFS2_IOC_RESVSP64: 1729 /* 1730 * This takes unsigned offsets, but the signed ones we 1731 * pass have been checked against overflow above. 1732 */ 1733 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, 1734 sr->l_len); 1735 break; 1736 case OCFS2_IOC_UNRESVSP: 1737 case OCFS2_IOC_UNRESVSP64: 1738 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, 1739 sr->l_len); 1740 break; 1741 default: 1742 ret = -EINVAL; 1743 } 1744 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1745 if (ret) { 1746 mlog_errno(ret); 1747 goto out_inode_unlock; 1748 } 1749 1750 /* 1751 * We update c/mtime for these changes 1752 */ 1753 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1754 if (IS_ERR(handle)) { 1755 ret = PTR_ERR(handle); 1756 mlog_errno(ret); 1757 goto out_inode_unlock; 1758 } 1759 1760 if (change_size && i_size_read(inode) < size) 1761 i_size_write(inode, size); 1762 1763 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1764 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); 1765 if (ret < 0) 1766 mlog_errno(ret); 1767 1768 ocfs2_commit_trans(osb, handle); 1769 1770 out_inode_unlock: 1771 brelse(di_bh); 1772 ocfs2_inode_unlock(inode, 1); 1773 out_rw_unlock: 1774 ocfs2_rw_unlock(inode, 1); 1775 1776 out: 1777 mutex_unlock(&inode->i_mutex); 1778 return ret; 1779 } 1780 1781 int ocfs2_change_file_space(struct file *file, unsigned int cmd, 1782 struct ocfs2_space_resv *sr) 1783 { 1784 struct inode *inode = file->f_path.dentry->d_inode; 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1786 1787 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1788 !ocfs2_writes_unwritten_extents(osb)) 1789 return -ENOTTY; 1790 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && 1791 !ocfs2_sparse_alloc(osb)) 1792 return -ENOTTY; 1793 1794 if (!S_ISREG(inode->i_mode)) 1795 return -EINVAL; 1796 1797 if (!(file->f_mode & FMODE_WRITE)) 1798 return -EBADF; 1799 1800 return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); 1801 } 1802 1803 static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, 1804 loff_t len) 1805 { 1806 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1807 struct ocfs2_space_resv sr; 1808 int change_size = 1; 1809 1810 if (!ocfs2_writes_unwritten_extents(osb)) 1811 return -EOPNOTSUPP; 1812 1813 if (S_ISDIR(inode->i_mode)) 1814 return -ENODEV; 1815 1816 if (mode & FALLOC_FL_KEEP_SIZE) 1817 change_size = 0; 1818 1819 sr.l_whence = 0; 1820 sr.l_start = (s64)offset; 1821 sr.l_len = (s64)len; 1822 1823 return __ocfs2_change_file_space(NULL, inode, offset, 1824 OCFS2_IOC_RESVSP64, &sr, change_size); 1825 } 1826 1827 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, 1828 size_t count) 1829 { 1830 int ret = 0; 1831 unsigned int extent_flags; 1832 u32 cpos, clusters, extent_len, phys_cpos; 1833 struct super_block *sb = inode->i_sb; 1834 1835 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 1836 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || 1837 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1838 return 0; 1839 1840 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1841 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1842 1843 while (clusters) { 1844 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1845 &extent_flags); 1846 if (ret < 0) { 1847 mlog_errno(ret); 1848 goto out; 1849 } 1850 1851 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { 1852 ret = 1; 1853 break; 1854 } 1855 1856 if (extent_len > clusters) 1857 extent_len = clusters; 1858 1859 clusters -= extent_len; 1860 cpos += extent_len; 1861 } 1862 out: 1863 return ret; 1864 } 1865 1866 static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 1867 loff_t pos, size_t count, 1868 int *meta_level) 1869 { 1870 int ret; 1871 struct buffer_head *di_bh = NULL; 1872 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; 1873 u32 clusters = 1874 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; 1875 1876 ret = ocfs2_inode_lock(inode, &di_bh, 1); 1877 if (ret) { 1878 mlog_errno(ret); 1879 goto out; 1880 } 1881 1882 *meta_level = 1; 1883 1884 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 1885 if (ret) 1886 mlog_errno(ret); 1887 out: 1888 brelse(di_bh); 1889 return ret; 1890 } 1891 1892 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1893 loff_t *ppos, 1894 size_t count, 1895 int appending, 1896 int *direct_io, 1897 int *has_refcount) 1898 { 1899 int ret = 0, meta_level = 0; 1900 struct inode *inode = dentry->d_inode; 1901 loff_t saved_pos, end; 1902 1903 /* 1904 * We start with a read level meta lock and only jump to an ex 1905 * if we need to make modifications here. 1906 */ 1907 for(;;) { 1908 ret = ocfs2_inode_lock(inode, NULL, meta_level); 1909 if (ret < 0) { 1910 meta_level = -1; 1911 mlog_errno(ret); 1912 goto out; 1913 } 1914 1915 /* Clear suid / sgid if necessary. We do this here 1916 * instead of later in the write path because 1917 * remove_suid() calls ->setattr without any hint that 1918 * we may have already done our cluster locking. Since 1919 * ocfs2_setattr() *must* take cluster locks to 1920 * proceeed, this will lead us to recursively lock the 1921 * inode. There's also the dinode i_size state which 1922 * can be lost via setattr during extending writes (we 1923 * set inode->i_size at the end of a write. */ 1924 if (should_remove_suid(dentry)) { 1925 if (meta_level == 0) { 1926 ocfs2_inode_unlock(inode, meta_level); 1927 meta_level = 1; 1928 continue; 1929 } 1930 1931 ret = ocfs2_write_remove_suid(inode); 1932 if (ret < 0) { 1933 mlog_errno(ret); 1934 goto out_unlock; 1935 } 1936 } 1937 1938 /* work on a copy of ppos until we're sure that we won't have 1939 * to recalculate it due to relocking. */ 1940 if (appending) { 1941 saved_pos = i_size_read(inode); 1942 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1943 } else { 1944 saved_pos = *ppos; 1945 } 1946 1947 end = saved_pos + count; 1948 1949 ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); 1950 if (ret == 1) { 1951 ocfs2_inode_unlock(inode, meta_level); 1952 meta_level = -1; 1953 1954 ret = ocfs2_prepare_inode_for_refcount(inode, 1955 saved_pos, 1956 count, 1957 &meta_level); 1958 if (has_refcount) 1959 *has_refcount = 1; 1960 if (direct_io) 1961 *direct_io = 0; 1962 } 1963 1964 if (ret < 0) { 1965 mlog_errno(ret); 1966 goto out_unlock; 1967 } 1968 1969 /* 1970 * Skip the O_DIRECT checks if we don't need 1971 * them. 1972 */ 1973 if (!direct_io || !(*direct_io)) 1974 break; 1975 1976 /* 1977 * There's no sane way to do direct writes to an inode 1978 * with inline data. 1979 */ 1980 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1981 *direct_io = 0; 1982 break; 1983 } 1984 1985 /* 1986 * Allowing concurrent direct writes means 1987 * i_size changes wouldn't be synchronized, so 1988 * one node could wind up truncating another 1989 * nodes writes. 1990 */ 1991 if (end > i_size_read(inode)) { 1992 *direct_io = 0; 1993 break; 1994 } 1995 1996 /* 1997 * We don't fill holes during direct io, so 1998 * check for them here. If any are found, the 1999 * caller will have to retake some cluster 2000 * locks and initiate the io as buffered. 2001 */ 2002 ret = ocfs2_check_range_for_holes(inode, saved_pos, count); 2003 if (ret == 1) { 2004 *direct_io = 0; 2005 ret = 0; 2006 } else if (ret < 0) 2007 mlog_errno(ret); 2008 break; 2009 } 2010 2011 if (appending) 2012 *ppos = saved_pos; 2013 2014 out_unlock: 2015 if (meta_level >= 0) 2016 ocfs2_inode_unlock(inode, meta_level); 2017 2018 out: 2019 return ret; 2020 } 2021 2022 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 2023 const struct iovec *iov, 2024 unsigned long nr_segs, 2025 loff_t pos) 2026 { 2027 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 2028 int can_do_direct, has_refcount = 0; 2029 ssize_t written = 0; 2030 size_t ocount; /* original count */ 2031 size_t count; /* after file limit checks */ 2032 loff_t old_size, *ppos = &iocb->ki_pos; 2033 u32 old_clusters; 2034 struct file *file = iocb->ki_filp; 2035 struct inode *inode = file->f_path.dentry->d_inode; 2036 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2037 2038 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2039 (unsigned int)nr_segs, 2040 file->f_path.dentry->d_name.len, 2041 file->f_path.dentry->d_name.name); 2042 2043 if (iocb->ki_left == 0) 2044 return 0; 2045 2046 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2047 2048 appending = file->f_flags & O_APPEND ? 1 : 0; 2049 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 2050 2051 mutex_lock(&inode->i_mutex); 2052 2053 relock: 2054 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2055 if (direct_io) { 2056 down_read(&inode->i_alloc_sem); 2057 have_alloc_sem = 1; 2058 } 2059 2060 /* concurrent O_DIRECT writes are allowed */ 2061 rw_level = !direct_io; 2062 ret = ocfs2_rw_lock(inode, rw_level); 2063 if (ret < 0) { 2064 mlog_errno(ret); 2065 goto out_sems; 2066 } 2067 2068 can_do_direct = direct_io; 2069 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 2070 iocb->ki_left, appending, 2071 &can_do_direct, &has_refcount); 2072 if (ret < 0) { 2073 mlog_errno(ret); 2074 goto out; 2075 } 2076 2077 /* 2078 * We can't complete the direct I/O as requested, fall back to 2079 * buffered I/O. 2080 */ 2081 if (direct_io && !can_do_direct) { 2082 ocfs2_rw_unlock(inode, rw_level); 2083 up_read(&inode->i_alloc_sem); 2084 2085 have_alloc_sem = 0; 2086 rw_level = -1; 2087 2088 direct_io = 0; 2089 goto relock; 2090 } 2091 2092 /* 2093 * To later detect whether a journal commit for sync writes is 2094 * necessary, we sample i_size, and cluster count here. 2095 */ 2096 old_size = i_size_read(inode); 2097 old_clusters = OCFS2_I(inode)->ip_clusters; 2098 2099 /* communicate with ocfs2_dio_end_io */ 2100 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2101 2102 ret = generic_segment_checks(iov, &nr_segs, &ocount, 2103 VERIFY_READ); 2104 if (ret) 2105 goto out_dio; 2106 2107 count = ocount; 2108 ret = generic_write_checks(file, ppos, &count, 2109 S_ISBLK(inode->i_mode)); 2110 if (ret) 2111 goto out_dio; 2112 2113 if (direct_io) { 2114 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 2115 ppos, count, ocount); 2116 if (written < 0) { 2117 /* 2118 * direct write may have instantiated a few 2119 * blocks outside i_size. Trim these off again. 2120 * Don't need i_size_read because we hold i_mutex. 2121 * 2122 * XXX(hch): this looks buggy because ocfs2 did not 2123 * actually implement ->truncate. Take a look at 2124 * the new truncate sequence and update this accordingly 2125 */ 2126 if (*ppos + count > inode->i_size) 2127 simple_setsize(inode, inode->i_size); 2128 ret = written; 2129 goto out_dio; 2130 } 2131 } else { 2132 current->backing_dev_info = file->f_mapping->backing_dev_info; 2133 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, 2134 ppos, count, 0); 2135 current->backing_dev_info = NULL; 2136 } 2137 2138 out_dio: 2139 /* buffered aio wouldn't have proper lock coverage today */ 2140 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2141 2142 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2143 ((file->f_flags & O_DIRECT) && has_refcount)) { 2144 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2145 pos + count - 1); 2146 if (ret < 0) 2147 written = ret; 2148 2149 if (!ret && ((old_size != i_size_read(inode)) || 2150 (old_clusters != OCFS2_I(inode)->ip_clusters) || 2151 has_refcount)) { 2152 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2153 if (ret < 0) 2154 written = ret; 2155 } 2156 2157 if (!ret) 2158 ret = filemap_fdatawait_range(file->f_mapping, pos, 2159 pos + count - 1); 2160 } 2161 2162 /* 2163 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2164 * function pointer which is called when o_direct io completes so that 2165 * it can unlock our rw lock. (it's the clustered equivalent of 2166 * i_alloc_sem; protects truncate from racing with pending ios). 2167 * Unfortunately there are error cases which call end_io and others 2168 * that don't. so we don't have to unlock the rw_lock if either an 2169 * async dio is going to do it in the future or an end_io after an 2170 * error has already done it. 2171 */ 2172 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2173 rw_level = -1; 2174 have_alloc_sem = 0; 2175 } 2176 2177 out: 2178 if (rw_level != -1) 2179 ocfs2_rw_unlock(inode, rw_level); 2180 2181 out_sems: 2182 if (have_alloc_sem) 2183 up_read(&inode->i_alloc_sem); 2184 2185 mutex_unlock(&inode->i_mutex); 2186 2187 if (written) 2188 ret = written; 2189 mlog_exit(ret); 2190 return ret; 2191 } 2192 2193 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, 2194 struct file *out, 2195 struct splice_desc *sd) 2196 { 2197 int ret; 2198 2199 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2200 sd->total_len, 0, NULL, NULL); 2201 if (ret < 0) { 2202 mlog_errno(ret); 2203 return ret; 2204 } 2205 2206 return splice_from_pipe_feed(pipe, sd, pipe_to_file); 2207 } 2208 2209 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2210 struct file *out, 2211 loff_t *ppos, 2212 size_t len, 2213 unsigned int flags) 2214 { 2215 int ret; 2216 struct address_space *mapping = out->f_mapping; 2217 struct inode *inode = mapping->host; 2218 struct splice_desc sd = { 2219 .total_len = len, 2220 .flags = flags, 2221 .pos = *ppos, 2222 .u.file = out, 2223 }; 2224 2225 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 2226 (unsigned int)len, 2227 out->f_path.dentry->d_name.len, 2228 out->f_path.dentry->d_name.name); 2229 2230 if (pipe->inode) 2231 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2232 2233 splice_from_pipe_begin(&sd); 2234 do { 2235 ret = splice_from_pipe_next(pipe, &sd); 2236 if (ret <= 0) 2237 break; 2238 2239 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2240 ret = ocfs2_rw_lock(inode, 1); 2241 if (ret < 0) 2242 mlog_errno(ret); 2243 else { 2244 ret = ocfs2_splice_to_file(pipe, out, &sd); 2245 ocfs2_rw_unlock(inode, 1); 2246 } 2247 mutex_unlock(&inode->i_mutex); 2248 } while (ret > 0); 2249 splice_from_pipe_end(pipe, &sd); 2250 2251 if (pipe->inode) 2252 mutex_unlock(&pipe->inode->i_mutex); 2253 2254 if (sd.num_spliced) 2255 ret = sd.num_spliced; 2256 2257 if (ret > 0) { 2258 unsigned long nr_pages; 2259 int err; 2260 2261 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2262 2263 err = generic_write_sync(out, *ppos, ret); 2264 if (err) 2265 ret = err; 2266 else 2267 *ppos += ret; 2268 2269 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2270 } 2271 2272 mlog_exit(ret); 2273 return ret; 2274 } 2275 2276 static ssize_t ocfs2_file_splice_read(struct file *in, 2277 loff_t *ppos, 2278 struct pipe_inode_info *pipe, 2279 size_t len, 2280 unsigned int flags) 2281 { 2282 int ret = 0, lock_level = 0; 2283 struct inode *inode = in->f_path.dentry->d_inode; 2284 2285 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2286 (unsigned int)len, 2287 in->f_path.dentry->d_name.len, 2288 in->f_path.dentry->d_name.name); 2289 2290 /* 2291 * See the comment in ocfs2_file_aio_read() 2292 */ 2293 ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level); 2294 if (ret < 0) { 2295 mlog_errno(ret); 2296 goto bail; 2297 } 2298 ocfs2_inode_unlock(inode, lock_level); 2299 2300 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2301 2302 bail: 2303 mlog_exit(ret); 2304 return ret; 2305 } 2306 2307 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 2308 const struct iovec *iov, 2309 unsigned long nr_segs, 2310 loff_t pos) 2311 { 2312 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 2313 struct file *filp = iocb->ki_filp; 2314 struct inode *inode = filp->f_path.dentry->d_inode; 2315 2316 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 2317 (unsigned int)nr_segs, 2318 filp->f_path.dentry->d_name.len, 2319 filp->f_path.dentry->d_name.name); 2320 2321 if (!inode) { 2322 ret = -EINVAL; 2323 mlog_errno(ret); 2324 goto bail; 2325 } 2326 2327 /* 2328 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2329 * need locks to protect pending reads from racing with truncate. 2330 */ 2331 if (filp->f_flags & O_DIRECT) { 2332 down_read(&inode->i_alloc_sem); 2333 have_alloc_sem = 1; 2334 2335 ret = ocfs2_rw_lock(inode, 0); 2336 if (ret < 0) { 2337 mlog_errno(ret); 2338 goto bail; 2339 } 2340 rw_level = 0; 2341 /* communicate with ocfs2_dio_end_io */ 2342 ocfs2_iocb_set_rw_locked(iocb, rw_level); 2343 } 2344 2345 /* 2346 * We're fine letting folks race truncates and extending 2347 * writes with read across the cluster, just like they can 2348 * locally. Hence no rw_lock during read. 2349 * 2350 * Take and drop the meta data lock to update inode fields 2351 * like i_size. This allows the checks down below 2352 * generic_file_aio_read() a chance of actually working. 2353 */ 2354 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2355 if (ret < 0) { 2356 mlog_errno(ret); 2357 goto bail; 2358 } 2359 ocfs2_inode_unlock(inode, lock_level); 2360 2361 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2362 if (ret == -EINVAL) 2363 mlog(0, "generic_file_aio_read returned -EINVAL\n"); 2364 2365 /* buffered aio wouldn't have proper lock coverage today */ 2366 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2367 2368 /* see ocfs2_file_aio_write */ 2369 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2370 rw_level = -1; 2371 have_alloc_sem = 0; 2372 } 2373 2374 bail: 2375 if (have_alloc_sem) 2376 up_read(&inode->i_alloc_sem); 2377 if (rw_level != -1) 2378 ocfs2_rw_unlock(inode, rw_level); 2379 mlog_exit(ret); 2380 2381 return ret; 2382 } 2383 2384 const struct inode_operations ocfs2_file_iops = { 2385 .setattr = ocfs2_setattr, 2386 .getattr = ocfs2_getattr, 2387 .permission = ocfs2_permission, 2388 .setxattr = generic_setxattr, 2389 .getxattr = generic_getxattr, 2390 .listxattr = ocfs2_listxattr, 2391 .removexattr = generic_removexattr, 2392 .fallocate = ocfs2_fallocate, 2393 .fiemap = ocfs2_fiemap, 2394 }; 2395 2396 const struct inode_operations ocfs2_special_file_iops = { 2397 .setattr = ocfs2_setattr, 2398 .getattr = ocfs2_getattr, 2399 .permission = ocfs2_permission, 2400 }; 2401 2402 /* 2403 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with 2404 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2405 */ 2406 const struct file_operations ocfs2_fops = { 2407 .llseek = generic_file_llseek, 2408 .read = do_sync_read, 2409 .write = do_sync_write, 2410 .mmap = ocfs2_mmap, 2411 .fsync = ocfs2_sync_file, 2412 .release = ocfs2_file_release, 2413 .open = ocfs2_file_open, 2414 .aio_read = ocfs2_file_aio_read, 2415 .aio_write = ocfs2_file_aio_write, 2416 .unlocked_ioctl = ocfs2_ioctl, 2417 #ifdef CONFIG_COMPAT 2418 .compat_ioctl = ocfs2_compat_ioctl, 2419 #endif 2420 .lock = ocfs2_lock, 2421 .flock = ocfs2_flock, 2422 .splice_read = ocfs2_file_splice_read, 2423 .splice_write = ocfs2_file_splice_write, 2424 }; 2425 2426 const struct file_operations ocfs2_dops = { 2427 .llseek = generic_file_llseek, 2428 .read = generic_read_dir, 2429 .readdir = ocfs2_readdir, 2430 .fsync = ocfs2_sync_file, 2431 .release = ocfs2_dir_release, 2432 .open = ocfs2_dir_open, 2433 .unlocked_ioctl = ocfs2_ioctl, 2434 #ifdef CONFIG_COMPAT 2435 .compat_ioctl = ocfs2_compat_ioctl, 2436 #endif 2437 .lock = ocfs2_lock, 2438 .flock = ocfs2_flock, 2439 }; 2440 2441 /* 2442 * POSIX-lockless variants of our file_operations. 2443 * 2444 * These will be used if the underlying cluster stack does not support 2445 * posix file locking, if the user passes the "localflocks" mount 2446 * option, or if we have a local-only fs. 2447 * 2448 * ocfs2_flock is in here because all stacks handle UNIX file locks, 2449 * so we still want it in the case of no stack support for 2450 * plocks. Internally, it will do the right thing when asked to ignore 2451 * the cluster. 2452 */ 2453 const struct file_operations ocfs2_fops_no_plocks = { 2454 .llseek = generic_file_llseek, 2455 .read = do_sync_read, 2456 .write = do_sync_write, 2457 .mmap = ocfs2_mmap, 2458 .fsync = ocfs2_sync_file, 2459 .release = ocfs2_file_release, 2460 .open = ocfs2_file_open, 2461 .aio_read = ocfs2_file_aio_read, 2462 .aio_write = ocfs2_file_aio_write, 2463 .unlocked_ioctl = ocfs2_ioctl, 2464 #ifdef CONFIG_COMPAT 2465 .compat_ioctl = ocfs2_compat_ioctl, 2466 #endif 2467 .flock = ocfs2_flock, 2468 .splice_read = ocfs2_file_splice_read, 2469 .splice_write = ocfs2_file_splice_write, 2470 }; 2471 2472 const struct file_operations ocfs2_dops_no_plocks = { 2473 .llseek = generic_file_llseek, 2474 .read = generic_read_dir, 2475 .readdir = ocfs2_readdir, 2476 .fsync = ocfs2_sync_file, 2477 .release = ocfs2_dir_release, 2478 .open = ocfs2_dir_open, 2479 .unlocked_ioctl = ocfs2_ioctl, 2480 #ifdef CONFIG_COMPAT 2481 .compat_ioctl = ocfs2_compat_ioctl, 2482 #endif 2483 .flock = ocfs2_flock, 2484 }; 2485