xref: /linux/fs/ocfs2/file.c (revision b0148a98ec5151fec82064d95f11eb9efbc628ea)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * file.c
5  *
6  * File open, close, extend, truncate
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/capability.h>
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 #include <linux/pagemap.h>
32 #include <linux/uio.h>
33 #include <linux/sched.h>
34 #include <linux/pipe_fs_i.h>
35 #include <linux/mount.h>
36 
37 #define MLOG_MASK_PREFIX ML_INODE
38 #include <cluster/masklog.h>
39 
40 #include "ocfs2.h"
41 
42 #include "alloc.h"
43 #include "aops.h"
44 #include "dir.h"
45 #include "dlmglue.h"
46 #include "extent_map.h"
47 #include "file.h"
48 #include "sysfile.h"
49 #include "inode.h"
50 #include "ioctl.h"
51 #include "journal.h"
52 #include "mmap.h"
53 #include "suballoc.h"
54 #include "super.h"
55 
56 #include "buffer_head_io.h"
57 
58 static int ocfs2_sync_inode(struct inode *inode)
59 {
60 	filemap_fdatawrite(inode->i_mapping);
61 	return sync_mapping_buffers(inode->i_mapping);
62 }
63 
64 static int ocfs2_file_open(struct inode *inode, struct file *file)
65 {
66 	int status;
67 	int mode = file->f_flags;
68 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
69 
70 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
71 		   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
72 
73 	spin_lock(&oi->ip_lock);
74 
75 	/* Check that the inode hasn't been wiped from disk by another
76 	 * node. If it hasn't then we're safe as long as we hold the
77 	 * spin lock until our increment of open count. */
78 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
79 		spin_unlock(&oi->ip_lock);
80 
81 		status = -ENOENT;
82 		goto leave;
83 	}
84 
85 	if (mode & O_DIRECT)
86 		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
87 
88 	oi->ip_open_count++;
89 	spin_unlock(&oi->ip_lock);
90 	status = 0;
91 leave:
92 	mlog_exit(status);
93 	return status;
94 }
95 
96 static int ocfs2_file_release(struct inode *inode, struct file *file)
97 {
98 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
99 
100 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
101 		       file->f_path.dentry->d_name.len,
102 		       file->f_path.dentry->d_name.name);
103 
104 	spin_lock(&oi->ip_lock);
105 	if (!--oi->ip_open_count)
106 		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
107 	spin_unlock(&oi->ip_lock);
108 
109 	mlog_exit(0);
110 
111 	return 0;
112 }
113 
114 static int ocfs2_sync_file(struct file *file,
115 			   struct dentry *dentry,
116 			   int datasync)
117 {
118 	int err = 0;
119 	journal_t *journal;
120 	struct inode *inode = dentry->d_inode;
121 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
122 
123 	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
124 		   dentry->d_name.len, dentry->d_name.name);
125 
126 	err = ocfs2_sync_inode(dentry->d_inode);
127 	if (err)
128 		goto bail;
129 
130 	journal = osb->journal->j_journal;
131 	err = journal_force_commit(journal);
132 
133 bail:
134 	mlog_exit(err);
135 
136 	return (err < 0) ? -EIO : 0;
137 }
138 
139 int ocfs2_should_update_atime(struct inode *inode,
140 			      struct vfsmount *vfsmnt)
141 {
142 	struct timespec now;
143 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
144 
145 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
146 		return 0;
147 
148 	if ((inode->i_flags & S_NOATIME) ||
149 	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
150 		return 0;
151 
152 	/*
153 	 * We can be called with no vfsmnt structure - NFSD will
154 	 * sometimes do this.
155 	 *
156 	 * Note that our action here is different than touch_atime() -
157 	 * if we can't tell whether this is a noatime mount, then we
158 	 * don't know whether to trust the value of s_atime_quantum.
159 	 */
160 	if (vfsmnt == NULL)
161 		return 0;
162 
163 	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
164 	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
165 		return 0;
166 
167 	if (vfsmnt->mnt_flags & MNT_RELATIME) {
168 		if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
169 		    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
170 			return 1;
171 
172 		return 0;
173 	}
174 
175 	now = CURRENT_TIME;
176 	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
177 		return 0;
178 	else
179 		return 1;
180 }
181 
182 int ocfs2_update_inode_atime(struct inode *inode,
183 			     struct buffer_head *bh)
184 {
185 	int ret;
186 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
187 	handle_t *handle;
188 
189 	mlog_entry_void();
190 
191 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
192 	if (handle == NULL) {
193 		ret = -ENOMEM;
194 		mlog_errno(ret);
195 		goto out;
196 	}
197 
198 	inode->i_atime = CURRENT_TIME;
199 	ret = ocfs2_mark_inode_dirty(handle, inode, bh);
200 	if (ret < 0)
201 		mlog_errno(ret);
202 
203 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
204 out:
205 	mlog_exit(ret);
206 	return ret;
207 }
208 
209 int ocfs2_set_inode_size(handle_t *handle,
210 			 struct inode *inode,
211 			 struct buffer_head *fe_bh,
212 			 u64 new_i_size)
213 {
214 	int status;
215 
216 	mlog_entry_void();
217 	i_size_write(inode, new_i_size);
218 	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
219 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
220 
221 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
222 	if (status < 0) {
223 		mlog_errno(status);
224 		goto bail;
225 	}
226 
227 bail:
228 	mlog_exit(status);
229 	return status;
230 }
231 
232 static int ocfs2_simple_size_update(struct inode *inode,
233 				    struct buffer_head *di_bh,
234 				    u64 new_i_size)
235 {
236 	int ret;
237 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
238 	handle_t *handle = NULL;
239 
240 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
241 	if (handle == NULL) {
242 		ret = -ENOMEM;
243 		mlog_errno(ret);
244 		goto out;
245 	}
246 
247 	ret = ocfs2_set_inode_size(handle, inode, di_bh,
248 				   new_i_size);
249 	if (ret < 0)
250 		mlog_errno(ret);
251 
252 	ocfs2_commit_trans(osb, handle);
253 out:
254 	return ret;
255 }
256 
257 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
258 				     struct inode *inode,
259 				     struct buffer_head *fe_bh,
260 				     u64 new_i_size)
261 {
262 	int status;
263 	handle_t *handle;
264 
265 	mlog_entry_void();
266 
267 	/* TODO: This needs to actually orphan the inode in this
268 	 * transaction. */
269 
270 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
271 	if (IS_ERR(handle)) {
272 		status = PTR_ERR(handle);
273 		mlog_errno(status);
274 		goto out;
275 	}
276 
277 	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
278 	if (status < 0)
279 		mlog_errno(status);
280 
281 	ocfs2_commit_trans(osb, handle);
282 out:
283 	mlog_exit(status);
284 	return status;
285 }
286 
/*
 * Shrink the file to new_i_size. @di_bh holds the inode's dinode;
 * the caller is expected to hold the cluster/meta locks that make
 * reading and updating it safe (ocfs2_setattr() does).
 *
 * Returns 0 on success, negative errno on failure. -EINVAL is
 * returned if new_i_size would actually grow the file.
 */
static int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_truncate_context *tc = NULL;

	mlog_entry("(inode = %llu, new_i_size = %llu\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
		   (unsigned long long)new_i_size);

	/* Throw away the page cache beyond the new size before we
	 * start mucking with the on-disk state. */
	truncate_inode_pages(inode->i_mapping, new_i_size);

	fe = (struct ocfs2_dinode *) di_bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(fe)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		status = -EIO;
		goto bail;
	}

	/* In-core and on-disk sizes must agree at this point. */
	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	/* Growing the file is ocfs2_extend_file()'s job, not ours. */
	if (new_i_size > le64_to_cpu(fe->i_size)) {
		mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
		     (unsigned long long)le64_to_cpu(fe->i_size),
		     (unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
	     (unsigned long long)le64_to_cpu(fe->i_blkno),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     (unsigned long long)new_i_size);

	/* lets handle the simple truncate cases before doing any more
	 * cluster locking. */
	if (new_i_size == le64_to_cpu(fe->i_size))
		goto bail;

	/* This forces other nodes to sync and drop their pages. Do
	 * this even if we have a truncate without allocation change -
	 * ocfs2 cluster sizes can be much greater than page size, so
	 * we have to truncate them anyway.  */
	status = ocfs2_data_lock(inode, 1);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	ocfs2_data_unlock(inode, 1);

	if (le32_to_cpu(fe->i_clusters) ==
	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
		     fe->i_clusters);
		/* No allocation change is required, so lets fast path
		 * this truncate. */
		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
		if (status < 0)
			mlog_errno(status);
		goto bail;
	}

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* TODO: orphan dir cleanup here. */
bail:

	mlog_exit(status);
	return status;
}
387 
388 /*
389  * extend allocation only here.
390  * we'll update all the disk stuff, and oip->alloc_size
391  *
392  * expect stuff to be locked, a transaction started and enough data /
393  * metadata reservations in the contexts.
394  *
395  * Will return -EAGAIN, and a reason if a restart is needed.
396  * If passed in, *reason will always be set, even in error.
397  */
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
			       struct inode *inode,
			       u32 clusters_to_add,
			       struct buffer_head *fe_bh,
			       handle_t *handle,
			       struct ocfs2_alloc_context *data_ac,
			       struct ocfs2_alloc_context *meta_ac,
			       enum ocfs2_alloc_restarted *reason_ret)
{
	int status = 0;
	int free_extents;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	enum ocfs2_alloc_restarted reason = RESTART_NONE;
	u32 bit_off, num_bits;
	u64 block;

	BUG_ON(!clusters_to_add);

	/* How many extent records can still be added to the tree
	 * without allocating more metadata? Negative means error. */
	free_extents = ocfs2_num_free_extents(osb, inode, fe);
	if (free_extents < 0) {
		status = free_extents;
		mlog_errno(status);
		goto leave;
	}

	/* there are two cases which could cause us to EAGAIN in the
	 * we-need-more-metadata case:
	 * 1) we haven't reserved *any*
	 * 2) we are so fragmented, we've needed to add metadata too
	 *    many times. */
	if (!free_extents && !meta_ac) {
		mlog(0, "we haven't reserved any metadata!\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	} else if ((!free_extents)
		   && (ocfs2_alloc_context_bits_left(meta_ac)
		       < ocfs2_extend_meta_needed(fe))) {
		mlog(0, "filesystem is really fragmented...\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	}

	/* Grab one contiguous chunk of up to clusters_to_add clusters;
	 * num_bits tells us how many we actually got. */
	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
				      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	BUG_ON(num_bits > clusters_to_add);

	/* reserve our write early -- insert_extent may update the inode */
	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
				     num_bits, meta_ac);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* Keep the in-core cluster count in sync with the dinode,
	 * under ip_lock as readers expect. */
	le32_add_cpu(&fe->i_clusters, num_bits);
	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	clusters_to_add -= num_bits;

	/* Partial allocation: ask the caller to restart the
	 * transaction and call us again for the remainder. */
	if (clusters_to_add) {
		mlog(0, "need to alloc once more, clusters = %u, wanted = "
		     "%u\n", fe->i_clusters, clusters_to_add);
		status = -EAGAIN;
		reason = RESTART_TRANS;
	}

leave:
	mlog_exit(status);
	if (reason_ret)
		*reason_ret = reason;
	return status;
}
496 
/*
 * Add clusters_to_add clusters of allocation to the inode, looping
 * around ocfs2_do_extend_allocation() as many times as it takes.
 * Two restart points exist: restart_all re-runs the whole reservation
 * path (more metadata needed), restarted_transaction just extends the
 * running transaction. Expects the caller to hold i_mutex and the
 * relevant cluster locks.
 */
static int ocfs2_extend_allocation(struct inode *inode,
				   u32 clusters_to_add)
{
	int status = 0;
	int restart_func = 0;
	int drop_alloc_sem = 0;
	int credits, num_free_extents;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
				  OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	fe = (struct ocfs2_dinode *) bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(fe)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		status = -EIO;
		goto leave;
	}

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
	     "clusters_to_add = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
	     fe->i_clusters, clusters_to_add);

	/* Reserve new metadata blocks up front if the extent tree has
	 * no room left for another record. */
	num_free_extents = ocfs2_num_free_extents(osb,
						  inode,
						  fe);
	if (num_free_extents < 0) {
		status = num_free_extents;
		mlog_errno(status);
		goto leave;
	}

	if (!num_free_extents) {
		status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto leave;
		}
	}

	status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	/* blocks peope in read/write from reading our allocation
	 * until we're done changing it. We depend on i_mutex to block
	 * other extend/truncate calls while we're here. Ordering wrt
	 * start_trans is important here -- always do it before! */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	drop_alloc_sem = 1;

	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	/* reserve a write to the file entry early on - that we if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	/* -EAGAIN from here means "call me back"; why says whether a
	 * transaction restart or a full function restart is needed. */
	status = ocfs2_do_extend_allocation(osb,
					    inode,
					    clusters_to_add,
					    bh,
					    handle,
					    data_ac,
					    meta_ac,
					    &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			mlog(0, "restarting function.\n");
			restart_func = 1;
		} else {
			BUG_ON(why != RESTART_TRANS);

			mlog(0, "restarting transaction.\n");
			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    fe,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
	     fe->i_clusters, (unsigned long long)fe->i_size);
	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));

leave:
	/* Tear everything down; if a full restart was requested and no
	 * error occurred, loop back to restart_all with fresh state. */
	if (drop_alloc_sem) {
		up_write(&OCFS2_I(inode)->ip_alloc_sem);
		drop_alloc_sem = 0;
	}
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	if (bh) {
		brelse(bh);
		bh = NULL;
	}

	mlog_exit(status);
	return status;
}
673 
674 /* Some parts of this taken from generic_cont_expand, which turned out
675  * to be too fragile to do exactly what we need without us having to
676  * worry about recursive locking in ->prepare_write() and
677  * ->commit_write(). */
static int ocfs2_write_zero_page(struct inode *inode,
				 u64 size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index;
	unsigned int offset;
	handle_t *handle = NULL;
	int ret;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
	/* ugh.  in prepare/commit_write, if from==to==start of block, we
	** skip the prepare.  make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		offset++;
	}
	index = size >> PAGE_CACHE_SHIFT;

	page = grab_cache_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* from == to == offset: a zero-length prepare, which still maps
	 * and zeroes the enclosing block for us. */
	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* data=ordered needs the page's buffers walked into a
	 * transaction before commit. */
	if (ocfs2_should_order_data(inode)) {
		handle = ocfs2_start_walk_page_trans(inode, page, offset,
						     offset);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock;
		}
	}

	/* must not update i_size! */
	ret = block_commit_write(page, offset, offset);
	if (ret < 0)
		mlog_errno(ret);
	else
		ret = 0;

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
736 
737 static int ocfs2_zero_extend(struct inode *inode,
738 			     u64 zero_to_size)
739 {
740 	int ret = 0;
741 	u64 start_off;
742 	struct super_block *sb = inode->i_sb;
743 
744 	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
745 	while (start_off < zero_to_size) {
746 		ret = ocfs2_write_zero_page(inode, start_off);
747 		if (ret < 0) {
748 			mlog_errno(ret);
749 			goto out;
750 		}
751 
752 		start_off += sb->s_blocksize;
753 
754 		/*
755 		 * Very large extends have the potential to lock up
756 		 * the cpu for extended periods of time.
757 		 */
758 		cond_resched();
759 	}
760 
761 out:
762 	return ret;
763 }
764 
765 /*
766  * A tail_to_skip value > 0 indicates that we're being called from
767  * ocfs2_file_aio_write(). This has the following implications:
768  *
769  * - we don't want to update i_size
770  * - di_bh will be NULL, which is fine because it's only used in the
771  *   case where we want to update i_size.
772  * - ocfs2_zero_extend() will then only be filling the hole created
773  *   between i_size and the start of the write.
774  */
775 static int ocfs2_extend_file(struct inode *inode,
776 			     struct buffer_head *di_bh,
777 			     u64 new_i_size,
778 			     size_t tail_to_skip)
779 {
780 	int ret = 0;
781 	u32 clusters_to_add;
782 
783 	BUG_ON(!tail_to_skip && !di_bh);
784 
785 	/* setattr sometimes calls us like this. */
786 	if (new_i_size == 0)
787 		goto out;
788 
789 	if (i_size_read(inode) == new_i_size)
790   		goto out;
791 	BUG_ON(new_i_size < i_size_read(inode));
792 
793 	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
794 		OCFS2_I(inode)->ip_clusters;
795 
796 	/*
797 	 * protect the pages that ocfs2_zero_extend is going to be
798 	 * pulling into the page cache.. we do this before the
799 	 * metadata extend so that we don't get into the situation
800 	 * where we've extended the metadata but can't get the data
801 	 * lock to zero.
802 	 */
803 	ret = ocfs2_data_lock(inode, 1);
804 	if (ret < 0) {
805 		mlog_errno(ret);
806 		goto out;
807 	}
808 
809 	if (clusters_to_add) {
810 		ret = ocfs2_extend_allocation(inode, clusters_to_add);
811 		if (ret < 0) {
812 			mlog_errno(ret);
813 			goto out_unlock;
814 		}
815 	}
816 
817 	/*
818 	 * Call this even if we don't add any clusters to the tree. We
819 	 * still need to zero the area between the old i_size and the
820 	 * new i_size.
821 	 */
822 	ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
823 	if (ret < 0) {
824 		mlog_errno(ret);
825 		goto out_unlock;
826 	}
827 
828 	if (!tail_to_skip) {
829 		/* We're being called from ocfs2_setattr() which wants
830 		 * us to update i_size */
831 		ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
832 		if (ret < 0)
833 			mlog_errno(ret);
834 	}
835 
836 out_unlock:
837 	ocfs2_data_unlock(inode, 1);
838 
839 out:
840 	return ret;
841 }
842 
/*
 * VFS ->setattr for ocfs2. Takes rw and meta cluster locks (rw only
 * for size changes), dispatches size changes to truncate/extend, then
 * applies the remaining attributes and journals the dinode.
 */
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;

	mlog_entry("(0x%p, '%.*s')\n", dentry,
	           dentry->d_name.len, dentry->d_name.name);

	if (attr->ia_valid & ATTR_MODE)
		mlog(0, "mode change: %d\n", attr->ia_mode);
	if (attr->ia_valid & ATTR_UID)
		mlog(0, "uid change: %d\n", attr->ia_uid);
	if (attr->ia_valid & ATTR_GID)
		mlog(0, "gid change: %d\n", attr->ia_gid);
	if (attr->ia_valid & ATTR_SIZE)
		mlog(0, "size change...\n");
	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
		mlog(0, "time change...\n");

/* Attributes we know how to apply; anything else is silently a no-op. */
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
		return 0;
	}

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

	/* Size changes additionally need the rw lock to fence ongoing
	 * reads/writes; take it before the meta lock. */
	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	status = ocfs2_meta_lock(inode, &bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
		if (i_size_read(inode) > attr->ia_size)
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		else
			status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			/* NOTE(review): every failure here is reported to
			 * the caller as -ENOSPC, even if the underlying
			 * error was e.g. -EIO — confirm that is intended. */
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto bail_unlock;
	}

	/* Apply the non-size attributes to the in-core inode. */
	status = inode_setattr(inode, attr);
	if (status < 0) {
		mlog_errno(status);
		goto bail_commit;
	}

	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock:
	ocfs2_meta_unlock(inode, 1);
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
937 
938 int ocfs2_getattr(struct vfsmount *mnt,
939 		  struct dentry *dentry,
940 		  struct kstat *stat)
941 {
942 	struct inode *inode = dentry->d_inode;
943 	struct super_block *sb = dentry->d_inode->i_sb;
944 	struct ocfs2_super *osb = sb->s_fs_info;
945 	int err;
946 
947 	mlog_entry_void();
948 
949 	err = ocfs2_inode_revalidate(dentry);
950 	if (err) {
951 		if (err != -ENOENT)
952 			mlog_errno(err);
953 		goto bail;
954 	}
955 
956 	generic_fillattr(inode, stat);
957 
958 	/* We set the blksize from the cluster size for performance */
959 	stat->blksize = osb->s_clustersize;
960 
961 bail:
962 	mlog_exit(err);
963 
964 	return err;
965 }
966 
967 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
968 {
969 	int ret;
970 
971 	mlog_entry_void();
972 
973 	ret = ocfs2_meta_lock(inode, NULL, 0);
974 	if (ret) {
975 		mlog_errno(ret);
976 		goto out;
977 	}
978 
979 	ret = generic_permission(inode, mask, NULL);
980 
981 	ocfs2_meta_unlock(inode, 0);
982 out:
983 	mlog_exit(ret);
984 	return ret;
985 }
986 
987 static int ocfs2_write_remove_suid(struct inode *inode)
988 {
989 	int ret;
990 	struct buffer_head *bh = NULL;
991 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
992 	handle_t *handle;
993 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
994 	struct ocfs2_dinode *di;
995 
996 	mlog_entry("(Inode %llu, mode 0%o)\n",
997 		   (unsigned long long)oi->ip_blkno, inode->i_mode);
998 
999 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1000 	if (handle == NULL) {
1001 		ret = -ENOMEM;
1002 		mlog_errno(ret);
1003 		goto out;
1004 	}
1005 
1006 	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1007 	if (ret < 0) {
1008 		mlog_errno(ret);
1009 		goto out_trans;
1010 	}
1011 
1012 	ret = ocfs2_journal_access(handle, inode, bh,
1013 				   OCFS2_JOURNAL_ACCESS_WRITE);
1014 	if (ret < 0) {
1015 		mlog_errno(ret);
1016 		goto out_bh;
1017 	}
1018 
1019 	inode->i_mode &= ~S_ISUID;
1020 	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1021 		inode->i_mode &= ~S_ISGID;
1022 
1023 	di = (struct ocfs2_dinode *) bh->b_data;
1024 	di->i_mode = cpu_to_le16(inode->i_mode);
1025 
1026 	ret = ocfs2_journal_dirty(handle, bh);
1027 	if (ret < 0)
1028 		mlog_errno(ret);
1029 out_bh:
1030 	brelse(bh);
1031 out_trans:
1032 	ocfs2_commit_trans(osb, handle);
1033 out:
1034 	mlog_exit(ret);
1035 	return ret;
1036 }
1037 
/*
 * Pre-write cluster locking: decide whether the write at *ppos for
 * count bytes extends the file, taking the meta lock at read level
 * first and escalating to write level only when suid removal or an
 * allocating extend is required. On success the meta lock is held at
 * meta_level and released before returning; for O_APPEND, *ppos is
 * updated to the sampled i_size.
 */
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
					 loff_t *ppos,
					 size_t count,
					 int appending)
{
	int ret = 0, meta_level = appending;
	struct inode *inode = dentry->d_inode;
	u32 clusters;
	loff_t newsize, saved_pos;

	/*
	 * We sample i_size under a read level meta lock to see if our write
	 * is extending the file, if it is we back off and get a write level
	 * meta lock.
	 */
	for(;;) {
		ret = ocfs2_meta_lock(inode, NULL, meta_level);
		if (ret < 0) {
			/* -1 makes the out_unlock path skip the unlock. */
			meta_level = -1;
			mlog_errno(ret);
			goto out;
		}

		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
		 * proceeed, this will lead us to recursively lock the
		 * inode. There's also the dinode i_size state which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write. */
		if (should_remove_suid(dentry)) {
			if (meta_level == 0) {
				/* Need write level for the mode change;
				 * drop, escalate and re-sample. */
				ocfs2_meta_unlock(inode, meta_level);
				meta_level = 1;
				continue;
			}

			ret = ocfs2_write_remove_suid(inode);
			if (ret < 0) {
				mlog_errno(ret);
				goto out_unlock;
			}
		}

		/* work on a copy of ppos until we're sure that we won't have
		 * to recalculate it due to relocking. */
		if (appending) {
			saved_pos = i_size_read(inode);
			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
		} else {
			saved_pos = *ppos;
		}
		newsize = count + saved_pos;

		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
		     (long long) saved_pos, (long long) newsize,
		     (long long) i_size_read(inode));

		/* No need for a higher level metadata lock if we're
		 * never going past i_size. */
		if (newsize <= i_size_read(inode))
			break;

		if (meta_level == 0) {
			/* Extending write: escalate to a write level
			 * lock and loop to re-sample i_size. */
			ocfs2_meta_unlock(inode, meta_level);
			meta_level = 1;
			continue;
		}

		spin_lock(&OCFS2_I(inode)->ip_lock);
		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
			OCFS2_I(inode)->ip_clusters;
		spin_unlock(&OCFS2_I(inode)->ip_lock);

		mlog(0, "Writing at EOF, may need more allocation: "
		     "i_size = %lld, newsize = %lld, need %u clusters\n",
		     (long long) i_size_read(inode), (long long) newsize,
		     clusters);

		/* We only want to continue the rest of this loop if
		 * our extend will actually require more
		 * allocation. */
		if (!clusters)
			break;

		/* tail_to_skip = count: allocate and zero up to the
		 * write start, but leave i_size alone. */
		ret = ocfs2_extend_file(inode, NULL, newsize, count);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out_unlock;
		}
		break;
	}

	if (appending)
		*ppos = saved_pos;

out_unlock:
	ocfs2_meta_unlock(inode, meta_level);

out:
	return ret;
}
1143 
/*
 * Entry point for ->aio_write (plain write(2) arrives here via
 * do_sync_write).  Note that the 'pos' argument is unused --
 * iocb->ki_pos is authoritative, and ocfs2_prepare_inode_for_write()
 * may move it for O_APPEND writes.
 *
 * Lock ordering is i_mutex -> i_alloc_sem -> cluster rw lock, chosen
 * to match ocfs2_setattr() (see the comment at mutex_lock() below).
 * For async O_DIRECT i/o, ownership of i_alloc_sem and the rw lock
 * can be handed off to the dio completion path rather than released
 * here -- see the large comment before the -EIOCBQUEUED check.
 */
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs,
				    loff_t pos)
{
	int ret, rw_level, have_alloc_sem = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	int appending = filp->f_flags & O_APPEND ? 1 : 0;

	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
		   (unsigned int)nr_segs,
		   filp->f_path.dentry->d_name.len,
		   filp->f_path.dentry->d_name.name);

	/* happy write of zero bytes */
	if (iocb->ki_left == 0)
		return 0;

	mutex_lock(&inode->i_mutex);
	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
	if (filp->f_flags & O_DIRECT) {
		have_alloc_sem = 1;
		down_read(&inode->i_alloc_sem);
	}

	/* concurrent O_DIRECT writes are allowed */
	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
	ret = ocfs2_rw_lock(inode, rw_level);
	if (ret < 0) {
		rw_level = -1; /* nothing held, skip unlock at 'out' */
		mlog_errno(ret);
		goto out;
	}

	/* Sort out the append position, suid removal and any cluster
	 * allocation needed for a write past i_size before entering
	 * the generic write path. */
	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
					    iocb->ki_left, appending);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* communicate with ocfs2_dio_end_io */
	ocfs2_iocb_set_rw_locked(iocb);

	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/*
	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
	 * function pointer which is called when o_direct io completes so that
	 * it can unlock our rw lock.  (it's the clustered equivalent of
	 * i_alloc_sem; protects truncate from racing with pending ios).
	 * Unfortunately there are error cases which call end_io and others
	 * that don't.  so we don't have to unlock the rw_lock if either an
	 * async dio is going to do it in the future or an end_io after an
	 * error has already done it.
	 */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

out:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	return ret;
}
1219 
1220 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1221 				       struct file *out,
1222 				       loff_t *ppos,
1223 				       size_t len,
1224 				       unsigned int flags)
1225 {
1226 	int ret;
1227 	struct inode *inode = out->f_path.dentry->d_inode;
1228 
1229 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1230 		   (unsigned int)len,
1231 		   out->f_path.dentry->d_name.len,
1232 		   out->f_path.dentry->d_name.name);
1233 
1234 	inode_double_lock(inode, pipe->inode);
1235 
1236 	ret = ocfs2_rw_lock(inode, 1);
1237 	if (ret < 0) {
1238 		mlog_errno(ret);
1239 		goto out;
1240 	}
1241 
1242 	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
1243 	if (ret < 0) {
1244 		mlog_errno(ret);
1245 		goto out_unlock;
1246 	}
1247 
1248 	/* ok, we're done with i_size and alloc work */
1249 	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
1250 
1251 out_unlock:
1252 	ocfs2_rw_unlock(inode, 1);
1253 out:
1254 	inode_double_unlock(inode, pipe->inode);
1255 
1256 	mlog_exit(ret);
1257 	return ret;
1258 }
1259 
1260 static ssize_t ocfs2_file_splice_read(struct file *in,
1261 				      loff_t *ppos,
1262 				      struct pipe_inode_info *pipe,
1263 				      size_t len,
1264 				      unsigned int flags)
1265 {
1266 	int ret = 0;
1267 	struct inode *inode = in->f_path.dentry->d_inode;
1268 
1269 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
1270 		   (unsigned int)len,
1271 		   in->f_path.dentry->d_name.len,
1272 		   in->f_path.dentry->d_name.name);
1273 
1274 	/*
1275 	 * See the comment in ocfs2_file_aio_read()
1276 	 */
1277 	ret = ocfs2_meta_lock(inode, NULL, 0);
1278 	if (ret < 0) {
1279 		mlog_errno(ret);
1280 		goto bail;
1281 	}
1282 	ocfs2_meta_unlock(inode, 0);
1283 
1284 	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
1285 
1286 bail:
1287 	mlog_exit(ret);
1288 	return ret;
1289 }
1290 
1291 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1292 				   const struct iovec *iov,
1293 				   unsigned long nr_segs,
1294 				   loff_t pos)
1295 {
1296 	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
1297 	struct file *filp = iocb->ki_filp;
1298 	struct inode *inode = filp->f_path.dentry->d_inode;
1299 
1300 	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
1301 		   (unsigned int)nr_segs,
1302 		   filp->f_path.dentry->d_name.len,
1303 		   filp->f_path.dentry->d_name.name);
1304 
1305 	if (!inode) {
1306 		ret = -EINVAL;
1307 		mlog_errno(ret);
1308 		goto bail;
1309 	}
1310 
1311 	/*
1312 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
1313 	 * need locks to protect pending reads from racing with truncate.
1314 	 */
1315 	if (filp->f_flags & O_DIRECT) {
1316 		down_read(&inode->i_alloc_sem);
1317 		have_alloc_sem = 1;
1318 
1319 		ret = ocfs2_rw_lock(inode, 0);
1320 		if (ret < 0) {
1321 			mlog_errno(ret);
1322 			goto bail;
1323 		}
1324 		rw_level = 0;
1325 		/* communicate with ocfs2_dio_end_io */
1326 		ocfs2_iocb_set_rw_locked(iocb);
1327 	}
1328 
1329 	/*
1330 	 * We're fine letting folks race truncates and extending
1331 	 * writes with read across the cluster, just like they can
1332 	 * locally. Hence no rw_lock during read.
1333 	 *
1334 	 * Take and drop the meta data lock to update inode fields
1335 	 * like i_size. This allows the checks down below
1336 	 * generic_file_aio_read() a chance of actually working.
1337 	 */
1338 	ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
1339 	if (ret < 0) {
1340 		mlog_errno(ret);
1341 		goto bail;
1342 	}
1343 	ocfs2_meta_unlock(inode, lock_level);
1344 
1345 	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
1346 	if (ret == -EINVAL)
1347 		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1348 
1349 	/* buffered aio wouldn't have proper lock coverage today */
1350 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1351 
1352 	/* see ocfs2_file_aio_write */
1353 	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1354 		rw_level = -1;
1355 		have_alloc_sem = 0;
1356 	}
1357 
1358 bail:
1359 	if (have_alloc_sem)
1360 		up_read(&inode->i_alloc_sem);
1361 	if (rw_level != -1)
1362 		ocfs2_rw_unlock(inode, rw_level);
1363 	mlog_exit(ret);
1364 
1365 	return ret;
1366 }
1367 
/* Inode operations for regular files. */
struct inode_operations ocfs2_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};
1373 
/* Inode operations for special files; currently identical to
 * ocfs2_file_iops but kept as a separate table. */
struct inode_operations ocfs2_special_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};
1379 
/* File operations for regular files.  Synchronous read(2)/write(2)
 * reach the aio entry points below through the do_sync_read /
 * do_sync_write wrappers. */
const struct file_operations ocfs2_fops = {
	.read		= do_sync_read,
	.write		= do_sync_write,
	.sendfile	= generic_file_sendfile,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.ioctl		= ocfs2_ioctl,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};
1394 
/* File operations for directories. */
const struct file_operations ocfs2_dops = {
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.ioctl		= ocfs2_ioctl,
};
1401