1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * Portions of this source code were derived from Berkeley 4.3 BSD
31 * under license from the Regents of the University of California.
32 */
33
34 #include <sys/sysmacros.h>
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/systm.h>
38 #include <sys/t_lock.h>
39 #include <sys/uio.h>
40 #include <sys/kmem.h>
41 #include <sys/thread.h>
42 #include <sys/vfs.h>
43 #include <sys/errno.h>
44 #include <sys/buf.h>
45 #include <sys/vnode.h>
46 #include <sys/fs/ufs_trans.h>
47 #include <sys/fs/ufs_inode.h>
48 #include <sys/fs/ufs_fs.h>
49 #include <sys/fs/ufs_fsdir.h>
50 #include <sys/fs/ufs_quota.h>
51 #include <sys/fs/ufs_panic.h>
52 #include <sys/fs/ufs_bio.h>
53 #include <sys/fs/ufs_log.h>
54 #include <sys/cmn_err.h>
55 #include <sys/file.h>
56 #include <sys/debug.h>
57
58
59 extern kmutex_t ufsvfs_mutex;
60 extern struct ufsvfs *ufs_instances;
61
/*
 * hlock any file systems w/errored logs
 *
 * Pass 1 walks the global ufs_instances list and marks every mounted
 * file system whose log has errored (or that panicked) UT_HLOCKING,
 * which also blocks unmounts.  Pass 2 hard-locks each marked file
 * system in turn, re-scanning the list each iteration since it is
 * only stable while ufsvfs_mutex is held.  Returns nonzero when some
 * lock attempt should be retried later.
 */
int
ufs_trans_hlock()
{
	struct ufsvfs *ufsvfsp;
	struct lockfs lockfs;
	int error;
	int retry = 0;

	/*
	 * find fs's that paniced or have errored logging devices
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next) {
		/*
		 * not mounted; continue
		 */
		if ((ufsvfsp->vfs_vfs == NULL) ||
		    (ufsvfsp->vfs_validfs == UT_UNMOUNTED))
			continue;
		/*
		 * disallow unmounts (hlock occurs below)
		 */
		if (TRANS_ISERROR(ufsvfsp))
			ufsvfsp->vfs_validfs = UT_HLOCKING;
	}
	mutex_exit(&ufsvfs_mutex);

	/*
	 * hlock the fs's that paniced or have errored logging devices.
	 * ufsvfs_mutex is not held across the lockfs calls below; the
	 * UT_HLOCKING mark keeps the entry from being unmounted.
	 */
again:
	mutex_enter(&ufsvfs_mutex);
	for (ufsvfsp = ufs_instances; ufsvfsp; ufsvfsp = ufsvfsp->vfs_next)
		if (ufsvfsp->vfs_validfs == UT_HLOCKING)
			break;
	mutex_exit(&ufsvfs_mutex);
	if (ufsvfsp == NULL)
		return (retry);
	/*
	 * hlock the file system
	 */
	(void) ufs_fiolfss(ufsvfsp->vfs_root, &lockfs);
	if (!LOCKFS_IS_ELOCK(&lockfs)) {
		lockfs.lf_lock = LOCKFS_HLOCK;
		lockfs.lf_flags = 0;
		lockfs.lf_comlen = 0;
		lockfs.lf_comment = NULL;
		error = ufs_fiolfs(ufsvfsp->vfs_root, &lockfs, 0);
		/*
		 * retry after awhile; another app currently doing lockfs
		 */
		if (error == EBUSY || error == EINVAL)
			retry = 1;
	} else {
		/*
		 * Already error-locked: if fix-failure work is queued,
		 * raise the fix queue's low-water mark to its current
		 * count and wake its waiter, then retry this fs later.
		 */
		if (ufsfx_get_failure_qlen() > 0) {
			if (mutex_tryenter(&ufs_fix.uq_mutex)) {
				ufs_fix.uq_lowat = ufs_fix.uq_ne;
				cv_broadcast(&ufs_fix.uq_cv);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		}
		retry = 1;
	}

	/*
	 * allow unmounts
	 */
	ufsvfsp->vfs_validfs = UT_MOUNTED;
	goto again;
}
135
/*
 * Called when a log error is detected: make the hlock queue appear
 * full (uq_ne raised to the low-water mark) and wake its waiter so
 * ufs_trans_hlock() runs and hard-locks the errored file systems.
 */
/*ARGSUSED*/
void
ufs_trans_onerror()
{
	mutex_enter(&ufs_hlock.uq_mutex);
	ufs_hlock.uq_ne = ufs_hlock.uq_lowat;
	cv_broadcast(&ufs_hlock.uq_cv);
	mutex_exit(&ufs_hlock.uq_mutex);
}
145
146 void
ufs_trans_sbupdate(struct ufsvfs * ufsvfsp,struct vfs * vfsp,top_t topid)147 ufs_trans_sbupdate(struct ufsvfs *ufsvfsp, struct vfs *vfsp, top_t topid)
148 {
149 if (curthread->t_flag & T_DONTBLOCK) {
150 sbupdate(vfsp);
151 return;
152 } else {
153
154 if (panicstr && TRANS_ISTRANS(ufsvfsp))
155 return;
156
157 curthread->t_flag |= T_DONTBLOCK;
158 TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
159 sbupdate(vfsp);
160 TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBUPDATE_SIZE);
161 curthread->t_flag &= ~T_DONTBLOCK;
162 }
163 }
164
165 void
ufs_trans_iupdat(struct inode * ip,int waitfor)166 ufs_trans_iupdat(struct inode *ip, int waitfor)
167 {
168 struct ufsvfs *ufsvfsp;
169
170 if (curthread->t_flag & T_DONTBLOCK) {
171 rw_enter(&ip->i_contents, RW_READER);
172 ufs_iupdat(ip, waitfor);
173 rw_exit(&ip->i_contents);
174 return;
175 } else {
176 ufsvfsp = ip->i_ufsvfs;
177
178 if (panicstr && TRANS_ISTRANS(ufsvfsp))
179 return;
180
181 curthread->t_flag |= T_DONTBLOCK;
182 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
183 rw_enter(&ip->i_contents, RW_READER);
184 ufs_iupdat(ip, waitfor);
185 rw_exit(&ip->i_contents);
186 TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT, TOP_IUPDAT_SIZE(ip));
187 curthread->t_flag &= ~T_DONTBLOCK;
188 }
189 }
190
191 void
ufs_trans_sbwrite(struct ufsvfs * ufsvfsp,top_t topid)192 ufs_trans_sbwrite(struct ufsvfs *ufsvfsp, top_t topid)
193 {
194 if (curthread->t_flag & T_DONTBLOCK) {
195 mutex_enter(&ufsvfsp->vfs_lock);
196 ufs_sbwrite(ufsvfsp);
197 mutex_exit(&ufsvfsp->vfs_lock);
198 return;
199 } else {
200
201 if (panicstr && TRANS_ISTRANS(ufsvfsp))
202 return;
203
204 curthread->t_flag |= T_DONTBLOCK;
205 TRANS_BEGIN_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
206 mutex_enter(&ufsvfsp->vfs_lock);
207 ufs_sbwrite(ufsvfsp);
208 mutex_exit(&ufsvfsp->vfs_lock);
209 TRANS_END_ASYNC(ufsvfsp, topid, TOP_SBWRITE_SIZE);
210 curthread->t_flag &= ~T_DONTBLOCK;
211 }
212 }
213
214 /*ARGSUSED*/
215 int
ufs_trans_push_si(ufsvfs_t * ufsvfsp,delta_t dtyp,int ignore)216 ufs_trans_push_si(ufsvfs_t *ufsvfsp, delta_t dtyp, int ignore)
217 {
218 struct fs *fs;
219
220 fs = ufsvfsp->vfs_fs;
221 mutex_enter(&ufsvfsp->vfs_lock);
222 TRANS_LOG(ufsvfsp, (char *)fs->fs_u.fs_csp,
223 ldbtob(fsbtodb(fs, fs->fs_csaddr)), fs->fs_cssize,
224 (caddr_t)fs->fs_u.fs_csp, fs->fs_cssize);
225 mutex_exit(&ufsvfsp->vfs_lock);
226 return (0);
227 }
228
/*
 * Log-roll callback for a buffer delta: if the buffer for block
 * "bno" is still dirty (delayed-write) in the cache, push it to
 * disk.  Returns 0 when a dirty buffer was written, ENOENT when
 * there was nothing to push.
 */
/*ARGSUSED*/
int
ufs_trans_push_buf(ufsvfs_t *ufsvfsp, delta_t dtyp, daddr_t bno)
{
	struct buf *bp;

	/*
	 * Look the block up with a 1-byte size: if the real buffer is
	 * not cached we get back a freshly-allocated 1-byte placeholder
	 * rather than reading the block in.
	 */
	bp = (struct buf *)UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, 1);
	if (bp == NULL)
		return (ENOENT);

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not use brwrite() here since the buffer is already
		 * marked for retry or not by the code that called
		 * TRANS_BUF().
		 */
		UFS_BWRITE(ufsvfsp, bp);
		return (0);
	}
	/*
	 * If we did not find the real buf for this block above then
	 * clear the dev so the buf won't be found by mistake
	 * for this block later. We had to allocate at least a 1 byte
	 * buffer to keep brelse happy.
	 */
	if (bp->b_bufsize == 1) {
		bp->b_dev = (o_dev_t)NODEV;
		bp->b_edev = NODEV;
		bp->b_flags = 0;
	}
	brelse(bp);
	return (ENOENT);
}
262
263 /*ARGSUSED*/
264 int
ufs_trans_push_inode(ufsvfs_t * ufsvfsp,delta_t dtyp,ino_t ino)265 ufs_trans_push_inode(ufsvfs_t *ufsvfsp, delta_t dtyp, ino_t ino)
266 {
267 int error;
268 struct inode *ip;
269
270 /*
271 * Grab the quota lock (if the file system has not been forcibly
272 * unmounted).
273 */
274 if (ufsvfsp)
275 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
276
277 error = ufs_iget(ufsvfsp->vfs_vfs, ino, &ip, kcred);
278
279 if (ufsvfsp)
280 rw_exit(&ufsvfsp->vfs_dqrwlock);
281 if (error)
282 return (ENOENT);
283
284 if (ip->i_flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) {
285 rw_enter(&ip->i_contents, RW_READER);
286 ufs_iupdat(ip, 1);
287 rw_exit(&ip->i_contents);
288 VN_RELE(ITOV(ip));
289 return (0);
290 }
291 VN_RELE(ITOV(ip));
292 return (ENOENT);
293 }
294
295 #ifdef DEBUG
296 /*
297 * These routines maintain the metadata map (matamap)
298 */
299
300 /*
301 * update the metadata map at mount
302 */
303 static int
ufs_trans_mata_mount_scan(struct inode * ip,void * arg)304 ufs_trans_mata_mount_scan(struct inode *ip, void *arg)
305 {
306 /*
307 * wrong file system; keep looking
308 */
309 if (ip->i_ufsvfs != (struct ufsvfs *)arg)
310 return (0);
311
312 /*
313 * load the metadata map
314 */
315 rw_enter(&ip->i_contents, RW_WRITER);
316 ufs_trans_mata_iget(ip);
317 rw_exit(&ip->i_contents);
318 return (0);
319 }
320
321 void
ufs_trans_mata_mount(struct ufsvfs * ufsvfsp)322 ufs_trans_mata_mount(struct ufsvfs *ufsvfsp)
323 {
324 struct fs *fs = ufsvfsp->vfs_fs;
325 ino_t ino;
326 int i;
327
328 /*
329 * put static metadata into matamap
330 * superblock
331 * cylinder groups
332 * inode groups
333 * existing inodes
334 */
335 TRANS_MATAADD(ufsvfsp, ldbtob(SBLOCK), fs->fs_sbsize);
336
337 for (ino = i = 0; i < fs->fs_ncg; ++i, ino += fs->fs_ipg) {
338 TRANS_MATAADD(ufsvfsp,
339 ldbtob(fsbtodb(fs, cgtod(fs, i))), fs->fs_cgsize);
340 TRANS_MATAADD(ufsvfsp,
341 ldbtob(fsbtodb(fs, itod(fs, ino))),
342 fs->fs_ipg * sizeof (struct dinode));
343 }
344 (void) ufs_scan_inodes(0, ufs_trans_mata_mount_scan, ufsvfsp, ufsvfsp);
345 }
346
/*
 * Tear down the metadata map at unmount time.
 */
void
ufs_trans_mata_umount(struct ufsvfs *ufsvfsp)
{
	/* Drop every matamap entry for this file system. */
	top_mataclr(ufsvfsp);
}
355
356 /*
357 * summary info (may be extended during growfs test)
358 */
359 void
ufs_trans_mata_si(struct ufsvfs * ufsvfsp,struct fs * fs)360 ufs_trans_mata_si(struct ufsvfs *ufsvfsp, struct fs *fs)
361 {
362 TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, fs->fs_csaddr)),
363 fs->fs_cssize);
364 }
365
/*
 * scan an allocation block (either inode or true block)
 *
 * Walk up to nblk block pointers starting at blkp, adding each
 * allocated block to the matamap.  *fragsp counts down the file
 * fragments still unaccounted for: each full block consumes fs_frag
 * fragments and the final (possibly partial) block consumes the
 * remainder.  The loop stops early once *fragsp reaches zero.
 */
static void
ufs_trans_mata_direct(
	struct inode *ip,
	daddr_t *fragsp,
	daddr32_t *blkp,
	unsigned int nblk)
{
	int i;
	daddr_t frag;
	ulong_t nb;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	for (i = 0; i < nblk && *fragsp; ++i, ++blkp)
		if ((frag = *blkp) != 0) {
			/* full block, or the trailing partial block? */
			if (*fragsp > fs->fs_frag) {
				nb = fs->fs_bsize;
				*fragsp -= fs->fs_frag;
			} else {
				nb = *fragsp * fs->fs_fsize;
				*fragsp = 0;
			}
			TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
		}
}
394
/*
 * scan an indirect allocation block (either inode or true block)
 *
 * Read the indirect block "frag", register the blocks it points at
 * when they are metadata, and recurse one level down through the
 * pointer tree while unaccounted fragments remain (*fragsp != 0).
 */
static void
ufs_trans_mata_indir(
	struct inode *ip,
	daddr_t *fragsp,
	daddr_t frag,
	int level)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	int ne = fs->fs_bsize / (int)sizeof (daddr32_t);
	int i;
	struct buf *bp;
	daddr32_t *blkp;
	o_mode_t ifmt = ip->i_mode & IFMT;

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, frag), fs->fs_bsize);
	/* On read error, silently skip this subtree. */
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return;
	}
	blkp = bp->b_un.b_daddr;

	/*
	 * Blocks referenced from a level-0 pointer block are data, not
	 * metadata, unless the inode is a directory, shadow/attr dir,
	 * or the quota inode.  Higher levels are always pointer blocks.
	 */
	if (level || (ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod))
		ufs_trans_mata_direct(ip, fragsp, blkp, ne);

	if (level)
		for (i = 0; i < ne && *fragsp; ++i, ++blkp)
			ufs_trans_mata_indir(ip, fragsp, *blkp, level-1);
	brelse(bp);
}
429
/*
 * put appropriate metadata into matamap for this inode
 *
 * Direct data blocks count as metadata only for directories,
 * shadow/attr inodes, and the quota inode; indirect pointer blocks
 * are always metadata.  "frags" counts down the inode's fragments
 * still unaccounted for as the block lists are walked.
 */
void
ufs_trans_mata_iget(struct inode *ip)
{
	int i;
	daddr_t frags = dbtofsb(ip->i_fs, ip->i_blocks);
	o_mode_t ifmt = ip->i_mode & IFMT;

	if (frags && ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
	    (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
		ufs_trans_mata_direct(ip, &frags, &ip->i_db[0], NDADDR);

	/* The indirect pointer blocks themselves. */
	if (frags)
		ufs_trans_mata_direct(ip, &frags, &ip->i_ib[0], NIADDR);

	/* Walk each populated indirect tree (i is also its depth). */
	for (i = 0; i < NIADDR && frags; ++i)
		if (ip->i_ib[i])
			ufs_trans_mata_indir(ip, &frags, ip->i_ib[i], i);
}
451
452 /*
453 * freeing possible metadata (block of user data)
454 */
455 void
ufs_trans_mata_free(struct ufsvfs * ufsvfsp,offset_t mof,off_t nb)456 ufs_trans_mata_free(struct ufsvfs *ufsvfsp, offset_t mof, off_t nb)
457 {
458 top_matadel(ufsvfsp, mof, nb);
459
460 }
461
462 /*
463 * allocating metadata
464 */
465 void
ufs_trans_mata_alloc(struct ufsvfs * ufsvfsp,struct inode * ip,daddr_t frag,ulong_t nb,int indir)466 ufs_trans_mata_alloc(
467 struct ufsvfs *ufsvfsp,
468 struct inode *ip,
469 daddr_t frag,
470 ulong_t nb,
471 int indir)
472 {
473 struct fs *fs = ufsvfsp->vfs_fs;
474 o_mode_t ifmt = ip->i_mode & IFMT;
475
476 if (indir || ((ifmt == IFDIR) || (ifmt == IFSHAD) ||
477 (ifmt == IFATTRDIR) || (ip == ip->i_ufsvfs->vfs_qinod)))
478 TRANS_MATAADD(ufsvfsp, ldbtob(fsbtodb(fs, frag)), nb);
479 }
480
481 #endif /* DEBUG */
482
483 /*
484 * ufs_trans_dir is used to declare a directory delta
485 */
486 int
ufs_trans_dir(struct inode * ip,off_t offset)487 ufs_trans_dir(struct inode *ip, off_t offset)
488 {
489 daddr_t bn;
490 int contig = 0, error;
491
492 ASSERT(ip);
493 ASSERT(RW_WRITE_HELD(&ip->i_contents));
494 error = bmap_read(ip, (u_offset_t)offset, &bn, &contig);
495 if (error || (bn == UFS_HOLE)) {
496 cmn_err(CE_WARN, "ufs_trans_dir - could not get block"
497 " number error = %d bn = %d\n", error, (int)bn);
498 if (error == 0) /* treat UFS_HOLE as an I/O error */
499 error = EIO;
500 return (error);
501 }
502 TRANS_DELTA(ip->i_ufsvfs, ldbtob(bn), DIRBLKSIZ, DT_DIR, 0, 0);
503 return (error);
504 }
505
/*
 * Log-roll callback for a quota delta declared by ufs_trans_quota():
 * log the in-core dquot to its on-disk record and release the
 * logging hold (DQ_TRANS flag + reference) taken when the delta was
 * declared.  Returns 1 when the dquot was in error, 0 otherwise.
 */
/*ARGSUSED*/
int
ufs_trans_push_quota(ufsvfs_t *ufsvfsp, delta_t dtyp, struct dquot *dqp)
{
	/*
	 * Lock the quota subsystem (ufsvfsp can be NULL
	 * if the DQ_ERROR is set).
	 */
	if (ufsvfsp)
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	mutex_enter(&dqp->dq_lock);

	/*
	 * If this transaction has been cancelled by closedq_scan_inode(),
	 * then bail out now. We don't call dqput() in this case because
	 * it has already been done.
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (0);
	}

	if (dqp->dq_flags & DQ_ERROR) {
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct. We are done with
		 * the dquot (due to an error) so clear logging
		 * specific markers.
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~DQ_TRANS;
		dqput(dqp);
		mutex_exit(&dqp->dq_lock);
		if (ufsvfsp)
			rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (1);
	}

	if (dqp->dq_flags & (DQ_MOD | DQ_BLKS | DQ_FILES)) {
		ASSERT((dqp->dq_mof != UFS_HOLE) && (dqp->dq_mof != 0));
		TRANS_LOG(ufsvfsp, (caddr_t)&dqp->dq_dqb,
		    dqp->dq_mof, (int)sizeof (struct dqblk), NULL, 0);
		/*
		 * Paranoia to make sure that there is at least one
		 * reference to the dquot struct. Clear the
		 * modification flag because the operation is now in
		 * the log. Also clear the logging specific markers
		 * that were set in ufs_trans_quota().
		 */
		ASSERT(dqp->dq_cnt >= 1);
		dqp->dq_flags &= ~(DQ_MOD | DQ_TRANS);
		dqput(dqp);
	}

	/*
	 * At this point, the logging specific flag should be clear,
	 * but add paranoia just in case something has gone wrong.
	 */
	ASSERT((dqp->dq_flags & DQ_TRANS) == 0);
	mutex_exit(&dqp->dq_lock);
	if (ufsvfsp)
		rw_exit(&ufsvfsp->vfs_dqrwlock);
	return (0);
}
572
/*
 * ufs_trans_quota takes a modified dquot, marks it as having a
 * pending logging operation (holding an extra reference so it stays
 * valid while on the deltamap), and declares the delta covering its
 * on-disk quota record.  ufs_trans_push_quota() later writes the
 * record to the log and drops the hold.
 */
/*ARGSUSED*/
void
ufs_trans_quota(struct dquot *dqp)
{

	struct inode *qip = dqp->dq_ufsvfsp->vfs_qinod;

	ASSERT(qip);
	ASSERT(MUTEX_HELD(&dqp->dq_lock));
	ASSERT(dqp->dq_flags & DQ_MOD);
	ASSERT(dqp->dq_mof != 0);
	ASSERT(dqp->dq_mof != UFS_HOLE);

	/*
	 * Mark this dquot to indicate that we are starting a logging
	 * file system operation for this dquot. Also increment the
	 * reference count so that the dquot does not get reused while
	 * it is on the mapentry_t list. DQ_TRANS is cleared and the
	 * reference count is decremented by ufs_trans_push_quota.
	 *
	 * If the file system is force-unmounted while there is a
	 * pending quota transaction, then closedq_scan_inode() will
	 * clear the DQ_TRANS flag and decrement the reference count.
	 *
	 * Since deltamap_add() drops multiple transactions to the
	 * same dq_mof and ufs_trans_push_quota() won't get called,
	 * we use DQ_TRANS to prevent repeat transactions from
	 * incrementing the reference count (or calling TRANS_DELTA()).
	 */
	if ((dqp->dq_flags & DQ_TRANS) == 0) {
		dqp->dq_flags |= DQ_TRANS;
		dqp->dq_cnt++;
		TRANS_DELTA(qip->i_ufsvfs, dqp->dq_mof, sizeof (struct dqblk),
		    DT_QR, ufs_trans_push_quota, (ulong_t)dqp);
	}
}
613
/*
 * Release a dquot reference inside its own async transaction so any
 * final update of the on-disk quota record is logged.  Takes the
 * quota rwlock as reader around the release, bracketing the whole
 * operation with T_DONTBLOCK per the lockfs protocol.
 */
void
ufs_trans_dqrele(struct dquot *dqp)
{
	struct ufsvfs *ufsvfsp = dqp->dq_ufsvfsp;

	curthread->t_flag |= T_DONTBLOCK;
	TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	dqrele(dqp);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA, TOP_QUOTA_SIZE);
	curthread->t_flag &= ~T_DONTBLOCK;
}
627
/*
 * Transaction-sizing tunables.  TRANS_MAX_WRITE caps the portion of
 * a write handled per transaction; the variables may be patched for
 * testing.
 */
int ufs_trans_max_resv = TOP_MAX_RESV; /* will be adjusted for testing */
long ufs_trans_avgbfree = 0; /* will be adjusted for testing */
#define TRANS_MAX_WRITE (1024 * 1024)
size_t ufs_trans_max_resid = TRANS_MAX_WRITE;
632
/*
 * Calculate the log reservation for the given write or truncate
 *
 * The reservation is a base of two cylinder groups plus the inode,
 * plus a full block + delta header for every indirect-pointer block
 * the request may dirty, plus one cylinder group for each additional
 * cg the blocks could be drawn from (estimated via avgbfree).
 */
static ulong_t
ufs_log_amt(struct inode *ip, offset_t offset, ssize_t resid, int trunc)
{
	long ncg, last2blk;
	long niblk = 0;
	u_offset_t writeend, offblk;
	int resv;
	daddr_t nblk, maxfblk;
	long avgbfree;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	long fni = NINDIR(fs);		/* pointers per indirect block */
	int bsize = fs->fs_bsize;

	/*
	 * Assume that the request will fit in 1 or 2 cg's,
	 * resv is the amount of log space to reserve (in bytes).
	 */
	resv = SIZECG(ip) * 2 + INODESIZE + 1024;

	/*
	 * get max position of write in fs blocks
	 */
	writeend = offset + resid;
	maxfblk = lblkno(fs, writeend);
	offblk = lblkno(fs, offset);
	/*
	 * request size in fs blocks
	 */
	nblk = lblkno(fs, blkroundup(fs, resid));
	/*
	 * Adjust for sparse files: a truncate can free at most the
	 * blocks the file actually has.
	 */
	if (trunc)
		nblk = MIN(nblk, ip->i_blocks);

	/*
	 * Adjust avgbfree (for testing)
	 */
	avgbfree = (ufs_trans_avgbfree) ? 1 : ufsvfsp->vfs_avgbfree + 1;

	/*
	 * Calculate maximum number of blocks of triple indirect
	 * pointers to write.  last2blk is the first file block that
	 * needs the triple-indirect tree.
	 */
	last2blk = NDADDR + fni + fni * fni;
	if (maxfblk > last2blk) {
		long nl2ptr;
		long n3blk;

		if (offblk > last2blk)
			n3blk = maxfblk - offblk;
		else
			n3blk = maxfblk - last2blk;
		niblk += roundup(n3blk * sizeof (daddr_t), bsize) / bsize + 1;
		nl2ptr = roundup(niblk, fni) / fni + 1;
		niblk += roundup(nl2ptr * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n3blk;
	}
	/*
	 * calculate maximum number of blocks of double indirect
	 * pointers to write.
	 */
	if (maxfblk > NDADDR + fni) {
		long n2blk;

		if (offblk > NDADDR + fni)
			n2blk = maxfblk - offblk;
		else
			/*
			 * NOTE(review): unlike the triple-indirect case
			 * above (which subtracts the whole threshold,
			 * last2blk), this parses as
			 * (maxfblk - NDADDR) + fni, not
			 * maxfblk - (NDADDR + fni).  It can only
			 * over-estimate the reservation, so it is safe,
			 * but confirm intent before changing it.
			 */
			n2blk = maxfblk - NDADDR + fni;
		niblk += roundup(n2blk * sizeof (daddr_t), bsize) / bsize + 2;
		maxfblk -= n2blk;
	}
	/*
	 * Add in indirect pointer block write
	 */
	if (maxfblk > NDADDR) {
		niblk += 1;
	}
	/*
	 * Calculate deltas for indirect pointer writes
	 */
	resv += niblk * (fs->fs_bsize + sizeof (struct delta));
	/*
	 * maximum number of cg's needed for request
	 */
	ncg = nblk / avgbfree;
	if (ncg > fs->fs_ncg)
		ncg = fs->fs_ncg;

	/*
	 * maximum amount of log space needed for request
	 * (two cg's are already in the base reservation)
	 */
	if (ncg > 2)
		resv += (ncg - 2) * SIZECG(ip);

	return (resv);
}
734
/*
 * Calculate the amount of log space that needs to be reserved for this
 * trunc request. If the amount of log space is too large, then
 * calculate the size that the request needs to be split into.
 */
void
ufs_trans_trunc_resv(
	struct inode *ip,
	u_offset_t length,
	int *resvp,
	u_offset_t *residp)
{
	ulong_t resv;
	u_offset_t size, offset, resid;
	int nchunks, flag;

	/*
	 * *resvp is the amount of log space to reserve (in bytes).
	 * when nonzero, *residp is the number of bytes to truncate.
	 */
	*residp = 0;

	if (length < ip->i_size) {
		/* truncating down: this many bytes go away */
		size = ip->i_size - length;
	} else {
		resv = SIZECG(ip) * 2 + INODESIZE + 1024;
		/*
		 * truncate up, doesn't really use much space,
		 * the default above should be sufficient.
		 */
		goto done;
	}

	offset = length;
	resid = size;
	nchunks = 1;
	flag = 0;

	/*
	 * If this request takes too much log space, it will be split into
	 * "nchunks". If this split is not enough, linearly increment the
	 * nchunks in the next iteration.  The loop exits with resv being
	 * the reservation for a single chunk of "resid" bytes, estimated
	 * at the worst-case (last) chunk offset.
	 */
	for (; (resv = ufs_log_amt(ip, offset, resid, 1)) > ufs_trans_max_resv;
	    offset = length + (nchunks - 1) * resid) {
		if (!flag) {
			/* first overshoot: jump straight to a proportional split */
			nchunks = roundup(resv, ufs_trans_max_resv) /
			    ufs_trans_max_resv;
			flag = 1;
		} else {
			nchunks++;
		}
		resid = size / nchunks;
	}

	if (nchunks > 1) {
		*residp = resid;
	}
done:
	*resvp = resv;
}
796
/*
 * Truncate an inode to "length" under logging, splitting the trunc
 * into multiple transactions when one reservation would be too large.
 * Returns 0 or an errno from ufs_itrunc().
 */
int
ufs_trans_itrunc(struct inode *ip, u_offset_t length, int flags, cred_t *cr)
{
	int err, issync, resv;
	u_offset_t resid;
	int do_block = 0;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	/*
	 * Not logging; just do the trunc
	 */
	if (!TRANS_ISTRANS(ufsvfsp)) {
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		rw_enter(&ip->i_contents, RW_WRITER);
		err = ufs_itrunc(ip, length, flags, cr);
		rw_exit(&ip->i_contents);
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		return (err);
	}

	/*
	 * within the lockfs protocol but *not* part of a transaction.
	 * Remember whether the caller already had T_DONTBLOCK set so
	 * it is only cleared here if we set it.
	 */
	do_block = curthread->t_flag & T_DONTBLOCK;
	curthread->t_flag |= T_DONTBLOCK;

	/*
	 * Trunc the file (in pieces, if necessary)
	 */
again:
	ufs_trans_trunc_resv(ip, length, &resv, &resid);
	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ITRUNC, resv);
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	if (resid) {
		/*
		 * resid is only set if we have to truncate in chunks
		 */
		ASSERT(length + resid < ip->i_size);

		/*
		 * Partially trunc file down to desired size (length).
		 * Only retain I_FREE on the last partial trunc.
		 * Round up size to a block boundary, to ensure the truncate
		 * doesn't have to allocate blocks. This is done both for
		 * performance and to fix a bug where if the block can't be
		 * allocated then the inode delete fails, but the inode
		 * is still freed with attached blocks and non-zero size
		 * (bug 4348738).
		 */
		err = ufs_itrunc(ip, blkroundup(fs, (ip->i_size - resid)),
		    flags & ~I_FREE, cr);
		ASSERT(ip->i_size != length);
	} else
		err = ufs_itrunc(ip, length, flags, cr);
	if (!do_block)
		curthread->t_flag &= ~T_DONTBLOCK;
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ITRUNC, resv);

	/*
	 * More chunks remain: refresh the free-block estimate used by
	 * the reservation calculation and truncate the next chunk.
	 */
	if ((err == 0) && resid) {
		ufsvfsp->vfs_avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		goto again;
	}
	return (err);
}
865
/*
 * Calculate the amount of log space that needs to be reserved for this
 * write request. If the amount of log space is too large, then
 * calculate the size that the requests needs to be split into.
 * First try fixed chunks of size ufs_trans_max_resid. If that
 * is too big, iterate down to the largest size that will fit.
 * Pagein the pages in the first chunk here, so that the pagein is
 * avoided later when the transaction is open.
 */
void
ufs_trans_write_resv(
	struct inode *ip,
	struct uio *uio,
	int *resvp,
	int *residp)
{
	ulong_t resv;
	offset_t offset;
	ssize_t resid;
	int nchunks;

	/* *residp nonzero means the caller must split the write */
	*residp = 0;
	offset = uio->uio_offset;
	resid = MIN(uio->uio_resid, ufs_trans_max_resid);
	resv = ufs_log_amt(ip, offset, resid, 0);
	if (resv <= ufs_trans_max_resv) {
		uio_prefaultpages(resid, uio);
		/* split only if the fixed cap (not the resv) kicked in */
		if (resid != uio->uio_resid)
			*residp = resid;
		*resvp = resv;
		return;
	}

	/*
	 * Even ufs_trans_max_resid bytes reserve too much log space:
	 * shrink the chunk until a single chunk's reservation fits,
	 * estimating at the worst-case (last) chunk offset.
	 */
	resid = uio->uio_resid;
	nchunks = 1;
	for (; (resv = ufs_log_amt(ip, offset, resid, 0)) > ufs_trans_max_resv;
	    offset = uio->uio_offset + (nchunks - 1) * resid) {
		nchunks++;
		resid = uio->uio_resid / nchunks;
	}
	uio_prefaultpages(resid, uio);
	/*
	 * If this request takes too much log space, it will be split
	 */
	if (nchunks > 1)
		*residp = resid;
	*resvp = resv;
}
914
/*
 * Issue write request.
 *
 * Split a large request into smaller chunks, ending the current
 * transaction and opening a new one between chunks so no single
 * transaction hogs the log.  Entered with the transaction, the quota
 * rwlock, and i_contents already held; the caller issues the final
 * transaction end.
 */
int
ufs_trans_write(
	struct inode *ip,
	struct uio *uio,
	int ioflag,
	cred_t *cr,
	int resv,
	long resid)
{
	long realresid;
	int err;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

	/*
	 * since the write is too big and would "HOG THE LOG" it needs to
	 * be broken up and done in pieces. NOTE, the caller will
	 * issue the EOT after the request has been completed
	 */
	realresid = uio->uio_resid;

again:
	/*
	 * Perform partial request (uiomove will update uio for us)
	 * Request is split up into "resid" size chunks until
	 * "realresid" bytes have been transferred.
	 */
	uio->uio_resid = MIN(resid, realresid);
	realresid -= uio->uio_resid;
	err = wrip(ip, uio, ioflag, cr);

	/*
	 * Error or request is done; caller issues final EOT.
	 * (A nonzero uio_resid here means wrip() stopped short.)
	 */
	if (err || uio->uio_resid || (realresid == 0)) {
		uio->uio_resid += realresid;
		return (err);
	}

	/*
	 * Generate EOT for this part of the request.
	 * Locks are dropped around the transaction boundary and
	 * reacquired below, preserving the caller's lock state.
	 */
	rw_exit(&ip->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (ioflag & (FSYNC|FDSYNC)) {
		TRANS_END_SYNC(ufsvfsp, err, TOP_WRITE_SYNC, resv);
	} else {
		TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}

	/*
	 * Make sure the input buffer is resident before starting
	 * the next transaction.
	 */
	uio_prefaultpages(MIN(resid, realresid), uio);

	/*
	 * Generate BOT for next part of the request
	 */
	if (ioflag & (FSYNC|FDSYNC)) {
		int error;
		TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, error);
		ASSERT(!error);
	} else {
		TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, RW_WRITER);
	/*
	 * Error during EOT (probably device error while writing commit rec)
	 */
	if (err)
		return (err);
	goto again;
}
994