1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/vnode.h>
29 #include <sys/buf.h>
30 #include <sys/errno.h>
31 #include <sys/fssnap_if.h>
32 #include <sys/fs/ufs_inode.h>
33 #include <sys/fs/ufs_filio.h>
34 #include <sys/sysmacros.h>
35 #include <sys/modctl.h>
36 #include <sys/fs/ufs_log.h>
37 #include <sys/fs/ufs_bio.h>
38 #include <sys/fs/ufs_fsdir.h>
39 #include <sys/debug.h>
40 #include <sys/atomic.h>
41 #include <sys/kmem.h>
42 #include <sys/inttypes.h>
43 #include <sys/vfs.h>
44 #include <sys/mntent.h>
45 #include <sys/conf.h>
46 #include <sys/param.h>
47 #include <sys/kstat.h>
48 #include <sys/cmn_err.h>
49 #include <sys/sdt.h>
50
/*
 * Parameters of the multiplicative pseudo-random sequence used to
 * generate log head identities (see lufs_hd_genid()): ID(N+1) =
 * (ID(N) * BASE) % PRIME, computed in 64 bits and truncated to 32.
 */
#define	LUFS_GENID_PRIME	UINT64_C(4294967291)
#define	LUFS_GENID_BASE		UINT64_C(311)
#define	LUFS_NEXT_ID(id)	((uint32_t)(((id) * LUFS_GENID_BASE) % \
	LUFS_GENID_PRIME))

/* serializes lufs_snarf()/lufs_unsnarf() against ufs_sync()/ufs_update() */
extern kmutex_t ufs_scan_lock;

static kmutex_t log_mutex;	/* general purpose log layer lock */
kmutex_t ml_scan;		/* Scan thread synchronization */
kcondvar_t ml_scan_cv;		/* Scan thread synchronization */

/* kmem caches used by the log layer (populated elsewhere in lufs) */
struct kmem_cache *lufs_sv;
struct kmem_cache *lufs_bp;

/* Tunables */
uint_t ldl_maxlogsize = LDL_MAXLOGSIZE;
uint_t ldl_minlogsize = LDL_MINLOGSIZE;
uint_t ldl_softlogcap = LDL_SOFTLOGCAP;
uint32_t ldl_divisor = LDL_DIVISOR;		/* fs-size to log-size ratio */
uint32_t ldl_mintransfer = LDL_MINTRANSFER;
uint32_t ldl_maxtransfer = LDL_MAXTRANSFER;
uint32_t ldl_minbufsize = LDL_MINBUFSIZE;
uint32_t ldl_cgsizereq = 0;	/* if non-zero, overrides LDL_CGSIZEREQ(fs) */

/* Generation of header ids; last_loghead_ident is protected by genid_mutex */
static kmutex_t genid_mutex;
static uint32_t last_loghead_ident = UINT32_C(0);
78
79 /*
80 * Logging delta and roll statistics
81 */
82 struct delta_kstats {
83 kstat_named_t ds_superblock_deltas;
84 kstat_named_t ds_bitmap_deltas;
85 kstat_named_t ds_suminfo_deltas;
86 kstat_named_t ds_allocblk_deltas;
87 kstat_named_t ds_ab0_deltas;
88 kstat_named_t ds_dir_deltas;
89 kstat_named_t ds_inode_deltas;
90 kstat_named_t ds_fbiwrite_deltas;
91 kstat_named_t ds_quota_deltas;
92 kstat_named_t ds_shadow_deltas;
93
94 kstat_named_t ds_superblock_rolled;
95 kstat_named_t ds_bitmap_rolled;
96 kstat_named_t ds_suminfo_rolled;
97 kstat_named_t ds_allocblk_rolled;
98 kstat_named_t ds_ab0_rolled;
99 kstat_named_t ds_dir_rolled;
100 kstat_named_t ds_inode_rolled;
101 kstat_named_t ds_fbiwrite_rolled;
102 kstat_named_t ds_quota_rolled;
103 kstat_named_t ds_shadow_rolled;
104 } dkstats = {
105 { "superblock_deltas", KSTAT_DATA_UINT64 },
106 { "bitmap_deltas", KSTAT_DATA_UINT64 },
107 { "suminfo_deltas", KSTAT_DATA_UINT64 },
108 { "allocblk_deltas", KSTAT_DATA_UINT64 },
109 { "ab0_deltas", KSTAT_DATA_UINT64 },
110 { "dir_deltas", KSTAT_DATA_UINT64 },
111 { "inode_deltas", KSTAT_DATA_UINT64 },
112 { "fbiwrite_deltas", KSTAT_DATA_UINT64 },
113 { "quota_deltas", KSTAT_DATA_UINT64 },
114 { "shadow_deltas", KSTAT_DATA_UINT64 },
115
116 { "superblock_rolled", KSTAT_DATA_UINT64 },
117 { "bitmap_rolled", KSTAT_DATA_UINT64 },
118 { "suminfo_rolled", KSTAT_DATA_UINT64 },
119 { "allocblk_rolled", KSTAT_DATA_UINT64 },
120 { "ab0_rolled", KSTAT_DATA_UINT64 },
121 { "dir_rolled", KSTAT_DATA_UINT64 },
122 { "inode_rolled", KSTAT_DATA_UINT64 },
123 { "fbiwrite_rolled", KSTAT_DATA_UINT64 },
124 { "quota_rolled", KSTAT_DATA_UINT64 },
125 { "shadow_rolled", KSTAT_DATA_UINT64 }
126 };
127
128 uint64_t delta_stats[DT_MAX];
129 uint64_t roll_stats[DT_MAX];
130
131 /*
132 * General logging kstats
133 */
134 struct logstats logstats = {
135 { "master_reads", KSTAT_DATA_UINT64 },
136 { "master_writes", KSTAT_DATA_UINT64 },
137 { "log_reads_inmem", KSTAT_DATA_UINT64 },
138 { "log_reads", KSTAT_DATA_UINT64 },
139 { "log_writes", KSTAT_DATA_UINT64 },
140 { "log_master_reads", KSTAT_DATA_UINT64 },
141 { "log_roll_reads", KSTAT_DATA_UINT64 },
142 { "log_roll_writes", KSTAT_DATA_UINT64 }
143 };
144
/*
 * I/O completion routine: post the buffer's b_io semaphore so a thread
 * blocked in trans_not_wait() can proceed.  Always returns 0.
 */
int
trans_not_done(struct buf *cb)
{
	sema_v(&cb->b_io);
	return (0);
}
151
/*
 * Busy-wait for buffer I/O completion (B_DONE); used only while
 * panicking, when blocking synchronization is not available.
 */
static void
trans_wait_panic(struct buf *cb)
{
	while ((cb->b_flags & B_DONE) == 0)
		drv_usecwait(10);
}
158
/*
 * Wait for I/O started with trans_not_done() as the completion routine,
 * then return the buffer's error status via geterror().
 */
int
trans_not_wait(struct buf *cb)
{
	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr)
		trans_wait_panic(cb);
	else
		sema_p(&cb->b_io);

	return (geterror(cb));
}
172
/*
 * Wait for ordinary (biowait-style) buffer I/O completion and return
 * its error status.
 */
int
trans_wait(struct buf *cb)
{
	/*
	 * In case of panic, busy wait for completion and run md daemon queues
	 */
	if (panicstr)
		trans_wait_panic(cb);
	return (biowait(cb));
}
183
/*
 * Compute the log-block checksum over the nb bytes at lp and store it
 * in *sp.  *sp is zeroed first so that the checksum word itself (which
 * normally lies inside the summed region) does not contribute.
 *
 * The sum is accumulated in an unsigned type: the historical int32_t
 * accumulator could overflow, which is undefined behavior for signed
 * integers in C.  Unsigned wraparound yields the identical bit pattern,
 * so on-disk checksums are unchanged.
 */
static void
setsum(int32_t *sp, int32_t *lp, int nb)
{
	uint32_t csum = 0;

	*sp = 0;
	nb /= sizeof (int32_t);
	while (nb--)
		csum += (uint32_t)*lp++;
	*sp = (int32_t)csum;
}
195
/*
 * Validate the checksum stored at *sp against the nb bytes at lp.
 * Returns 1 on a match, 0 on a mismatch.  On a mismatch *sp is restored
 * to its stored value; on a match it holds the (equal) recomputed sum.
 */
static int
checksum(int32_t *sp, int32_t *lp, int nb)
{
	int32_t saved = *sp;

	setsum(sp, lp, nb);
	if (*sp == saved)
		return (1);
	*sp = saved;
	return (0);
}
208
/*
 * Tear down the in-core logging state built by lufs_snarf().
 *
 * Waits out any taskq-dispatched syncs still in flight, rolls committed
 * transactions and stops the roll thread, then frees the extent info,
 * circular buffers, maps, mutexes, the on-disk state buffer and the
 * ml_unit itself.  No-op when logging is not enabled (vfs_log == NULL).
 *
 * Callers that race with ufs_sync()/ufs_update() hold ufs_scan_lock
 * around this call (see lufs_snarf() error path and lufs_disable()).
 */
void
lufs_unsnarf(ufsvfs_t *ufsvfsp)
{
	ml_unit_t *ul;
	mt_map_t *mtm;

	ul = ufsvfsp->vfs_log;
	if (ul == NULL)
		return;

	mtm = ul->un_logmap;

	/*
	 * Wait for a pending top_issue_sync which is
	 * dispatched (via taskq_dispatch()) but hasn't completed yet.
	 */

	mutex_enter(&mtm->mtm_lock);

	while (mtm->mtm_taskq_sync_count != 0) {
		cv_wait(&mtm->mtm_cv, &mtm->mtm_lock);
	}

	mutex_exit(&mtm->mtm_lock);

	/* Roll committed transactions */
	logmap_roll_dev(ul);

	/* Kill the roll thread */
	logmap_kill_roll(ul);

	/* release saved allocation info */
	if (ul->un_ebp)
		kmem_free(ul->un_ebp, ul->un_nbeb);

	/* release circular bufs */
	free_cirbuf(&ul->un_rdbuf);
	free_cirbuf(&ul->un_wrbuf);

	/* release maps */
	if (ul->un_logmap)
		ul->un_logmap = map_put(ul->un_logmap);
	if (ul->un_deltamap)
		ul->un_deltamap = map_put(ul->un_deltamap);
	if (ul->un_matamap)
		ul->un_matamap = map_put(ul->un_matamap);

	mutex_destroy(&ul->un_log_mutex);
	mutex_destroy(&ul->un_state_mutex);

	/* release state buffer MUST BE LAST!! (contains our ondisk data) */
	if (ul->un_bp)
		brelse(ul->un_bp);
	kmem_free(ul, sizeof (*ul));

	ufsvfsp->vfs_log = NULL;
}
266
/*
 * Read ("snarf") the on-disk log state into core and build the
 * in-memory logging structures for a mount or remount.
 *
 *	ufsvfsp	- file system instance
 *	fs	- current superblock; may be newer than the one hanging
 *		  off ufsvfsp during a remount, hence passed explicitly
 *	ronly	- non-zero for a read-only mount: a bad log is tolerated
 *		  (LDL_NOROLL is set and scan errors are cleared)
 *
 * Returns 0 on success; EIO on read/validation failure, ENODEV on a
 * bad extent-block checksum, EDOM on a malformed extent block.
 */
int
lufs_snarf(ufsvfs_t *ufsvfsp, struct fs *fs, int ronly)
{
	buf_t *bp, *tbp;
	ml_unit_t *ul;
	extent_block_t *ebp;
	ic_extent_block_t *nebp;
	size_t nb;
	daddr_t bno;	/* in disk blocks */
	int i;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT(sizeof (ml_odunit_t) < DEV_BSIZE);

	/*
	 * Get the allocation table
	 * During a remount the superblock pointed to by the ufsvfsp
	 * is out of date. Hence the need for the ``new'' superblock
	 * pointer, fs, passed in as a parameter.
	 */
	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, logbtodb(fs, fs->fs_logbno),
	    fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	ebp = (void *)bp->b_un.b_addr;
	if (!checksum(&ebp->chksum, (int32_t *)bp->b_un.b_addr,
	    fs->fs_bsize)) {
		brelse(bp);
		return (ENODEV);
	}

	/*
	 * It is possible to get log blocks with all zeros.
	 * We should also check for nextents to be zero in such case.
	 */
	if (ebp->type != LUFS_EXTENTS || ebp->nextents == 0) {
		brelse(bp);
		return (EDOM);
	}
	/*
	 * Put allocation into memory. This requires conversion between
	 * on the ondisk format of the extent (type extent_t) and the
	 * in-core format of the extent (type ic_extent_t). The
	 * difference is the in-core form of the extent block stores
	 * the physical offset of the extent in disk blocks, which
	 * can require more than a 32-bit field.
	 */
	nb = (size_t)(sizeof (ic_extent_block_t) +
	    ((ebp->nextents - 1) * sizeof (ic_extent_t)));
	nebp = kmem_alloc(nb, KM_SLEEP);
	nebp->ic_nextents = ebp->nextents;
	nebp->ic_nbytes = ebp->nbytes;
	nebp->ic_nextbno = ebp->nextbno;
	for (i = 0; i < ebp->nextents; i++) {
		nebp->ic_extents[i].ic_lbno = ebp->extents[i].lbno;
		nebp->ic_extents[i].ic_nbno = ebp->extents[i].nbno;
		nebp->ic_extents[i].ic_pbno =
		    logbtodb(fs, ebp->extents[i].pbno);
	}
	brelse(bp);

	/*
	 * Get the log state
	 *
	 * The state was written in duplicate (see lufs_initialize());
	 * if the first copy cannot be read, fall back to the second.
	 */
	bno = nebp->ic_extents[0].ic_pbno;
	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno, DEV_BSIZE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno + 1, DEV_BSIZE);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			kmem_free(nebp, nb);
			return (EIO);
		}
	}

	/*
	 * Put ondisk struct into an anonymous buffer
	 * This buffer will contain the memory for the ml_odunit struct
	 */
	tbp = ngeteblk(dbtob(LS_SECTORS));
	tbp->b_edev = bp->b_edev;
	tbp->b_dev = bp->b_dev;
	tbp->b_blkno = bno;
	bcopy(bp->b_un.b_addr, tbp->b_un.b_addr, DEV_BSIZE);
	bcopy(bp->b_un.b_addr, tbp->b_un.b_addr + DEV_BSIZE, DEV_BSIZE);
	bp->b_flags |= (B_STALE | B_AGE);
	brelse(bp);
	bp = tbp;

	/*
	 * Verify the log state
	 *
	 * read/only mounts w/bad logs are allowed. umount will
	 * eventually roll the bad log until the first IO error.
	 * fsck will then repair the file system.
	 *
	 * read/write mounts with bad logs are not allowed.
	 *
	 */
	ul = (ml_unit_t *)kmem_zalloc(sizeof (*ul), KM_SLEEP);
	bcopy(bp->b_un.b_addr, &ul->un_ondisk, sizeof (ml_odunit_t));
	if ((ul->un_chksum != ul->un_head_ident + ul->un_tail_ident) ||
	    (ul->un_version != LUFS_VERSION_LATEST) ||
	    (!ronly && ul->un_badlog)) {
		kmem_free(ul, sizeof (*ul));
		brelse(bp);
		kmem_free(nebp, nb);
		return (EIO);
	}
	/*
	 * Initialize the incore-only fields
	 */
	if (ronly)
		ul->un_flags |= LDL_NOROLL;
	ul->un_bp = bp;
	ul->un_ufsvfs = ufsvfsp;
	ul->un_dev = ufsvfsp->vfs_dev;
	ul->un_ebp = nebp;
	ul->un_nbeb = nb;
	ul->un_maxresv = btodb(ul->un_logsize) * LDL_USABLE_BSIZE;
	ul->un_deltamap = map_get(ul, deltamaptype, DELTAMAP_NHASH);
	ul->un_logmap = map_get(ul, logmaptype, LOGMAP_NHASH);
	if (ul->un_debug & MT_MATAMAP)
		ul->un_matamap = map_get(ul, matamaptype, DELTAMAP_NHASH);
	mutex_init(&ul->un_log_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ul->un_state_mutex, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Acquire the ufs_scan_lock before linking the mtm data
	 * structure so that we keep ufs_sync() and ufs_update() away
	 * when they execute the ufs_scan_inodes() run while we're in
	 * progress of enabling/disabling logging.
	 */
	mutex_enter(&ufs_scan_lock);
	ufsvfsp->vfs_log = ul;

	/* remember the state of the log before the log scan */
	logmap_logscan(ul);
	mutex_exit(&ufs_scan_lock);

	/*
	 * Error during scan
	 *
	 * If this is a read/only mount; ignore the error.
	 * At a later time umount/fsck will repair the fs.
	 *
	 */
	if (ul->un_flags & LDL_ERROR) {
		if (!ronly) {
			/*
			 * Acquire the ufs_scan_lock before de-linking
			 * the mtm data structure so that we keep ufs_sync()
			 * and ufs_update() away when they execute the
			 * ufs_scan_inodes() run while we're in progress of
			 * enabling/disabling logging.
			 */
			mutex_enter(&ufs_scan_lock);
			lufs_unsnarf(ufsvfsp);
			mutex_exit(&ufs_scan_lock);
			return (EIO);
		}
		ul->un_flags &= ~LDL_ERROR;
	}
	if (!ronly)
		logmap_start_roll(ul);
	return (0);
}
437
/*
 * Generate the next log head identity.
 *
 * Advances the global pseudo-random sequence (LUFS_NEXT_ID) under
 * genid_mutex and returns the new element.  If `up' is non-NULL and the
 * new identity equals up->un_head_ident, the following element of the
 * sequence is returned instead, so a caller never gets back the
 * identity already in use by its log.
 */
uint32_t
lufs_hd_genid(const ml_unit_t *up)
{
	uint32_t id;

	mutex_enter(&genid_mutex);

	/*
	 * The formula below implements an exponential, modular sequence.
	 *
	 * ID(N) = (SEED * (BASE^N)) % PRIME
	 *
	 * The numbers will be pseudo random. They depend on SEED, BASE, PRIME,
	 * but will sweep through almost all of the range 1....PRIME-1.
	 * Most importantly they will not repeat for PRIME-2 (4294967289)
	 * repetitions. If they would repeat that could possibly cause hangs,
	 * panics at mount/umount and failed mount operations.
	 */
	id = LUFS_NEXT_ID(last_loghead_ident);

	/* Checking if new identity used already */
	if (up != NULL && up->un_head_ident == id) {
		DTRACE_PROBE1(head_ident_collision, uint32_t, id);

		/*
		 * The following preserves the algorithm for the fix for
		 * "panic: free: freeing free frag, dev:0x2000000018, blk:34605,
		 * cg:26, ino:148071,".
		 * If the header identities un_head_ident are equal to the
		 * present element in the sequence, the next element of the
		 * sequence is returned instead.
		 */
		id = LUFS_NEXT_ID(id);
	}

	last_loghead_ident = id;

	mutex_exit(&genid_mutex);

	return (id);
}
479
/*
 * One-time setup of the head-identity generator: initialize genid_mutex
 * and seed last_loghead_ident from the high-resolution clock, retrying
 * until the seed (mod LUFS_GENID_PRIME) is non-zero — a zero seed would
 * pin the multiplicative sequence at zero forever.
 */
static void
lufs_genid_init(void)
{
	uint64_t seed;

	/* Initialization */
	mutex_init(&genid_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* Seed the algorithm */
	do {
		timestruc_t tv;

		gethrestime(&tv);

		seed = (tv.tv_nsec << 3);
		seed ^= tv.tv_sec;

		last_loghead_ident = (uint32_t)(seed % LUFS_GENID_PRIME);
	} while (last_loghead_ident == UINT32_C(0));
}
500
/*
 * Write the initial on-disk log state (ml_odunit) at disk block bno.
 * The state is written in duplicate, one copy per DEV_BSIZE sector
 * (LS_SECTORS total), so lufs_snarf() can fall back to the second copy.
 *
 *	ufsvfsp	- file system instance
 *	bno	- disk block address of the log state area
 *	nb	- total log size in bytes (state area + log data)
 *	flp	- caller's request; nbytes_actual records the granted size
 *
 * Returns 0 on success, EIO if the write fails.
 */
static int
lufs_initialize(
	ufsvfs_t *ufsvfsp,
	daddr_t bno,
	size_t nb,
	struct fiolog *flp)
{
	ml_odunit_t *ud, *ud2;
	buf_t *bp;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT(sizeof (ml_odunit_t) < DEV_BSIZE);
	ASSERT(nb >= ldl_minlogsize);

	bp = UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, dbtob(LS_SECTORS));
	bzero(bp->b_un.b_addr, bp->b_bcount);

	ud = (void *)bp->b_un.b_addr;
	ud->od_version = LUFS_VERSION_LATEST;
	/* clamp transfer size into [ldl_mintransfer, ldl_maxtransfer] */
	ud->od_maxtransfer = MIN(ufsvfsp->vfs_iotransz, ldl_maxtransfer);
	if (ud->od_maxtransfer < ldl_mintransfer)
		ud->od_maxtransfer = ldl_mintransfer;
	ud->od_devbsize = DEV_BSIZE;

	ud->od_requestsize = flp->nbytes_actual;
	ud->od_statesize = dbtob(LS_SECTORS);
	ud->od_logsize = nb - ud->od_statesize;

	ud->od_statebno = INT32_C(0);

	/* fresh head/tail identities; checksum is their sum */
	ud->od_head_ident = lufs_hd_genid(NULL);
	ud->od_tail_ident = ud->od_head_ident;
	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;

	/* empty log: head == tail == beginning-of-log offset */
	ud->od_bol_lof = dbtob(ud->od_statebno) + ud->od_statesize;
	ud->od_eol_lof = ud->od_bol_lof + ud->od_logsize;
	ud->od_head_lof = ud->od_bol_lof;
	ud->od_tail_lof = ud->od_bol_lof;

	ASSERT(lufs_initialize_debug(ud));

	/* duplicate copy in the second sector */
	ud2 = (void *)(bp->b_un.b_addr + DEV_BSIZE);
	bcopy(ud, ud2, sizeof (*ud));

	UFS_BWRITE2(ufsvfsp, bp);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	brelse(bp);

	return (0);
}
554
555 /*
556 * Free log space
557 * Assumes the file system is write locked and is not logging
558 */
559 static int
lufs_free(struct ufsvfs * ufsvfsp)560 lufs_free(struct ufsvfs *ufsvfsp)
561 {
562 int error = 0, i, j;
563 buf_t *bp = NULL;
564 extent_t *ep;
565 extent_block_t *ebp;
566 struct fs *fs = ufsvfsp->vfs_fs;
567 daddr_t fno;
568 int32_t logbno;
569 long nfno;
570 inode_t *ip = NULL;
571 char clean;
572
573 /*
574 * Nothing to free
575 */
576 if (fs->fs_logbno == 0)
577 return (0);
578
579 /*
580 * Mark the file system as FSACTIVE and no log but honor the
581 * current value of fs_reclaim. The reclaim thread could have
582 * been active when lufs_disable() was called and if fs_reclaim
583 * is reset to zero here it could lead to lost inodes.
584 */
585 ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
586 mutex_enter(&ufsvfsp->vfs_lock);
587 clean = fs->fs_clean;
588 logbno = fs->fs_logbno;
589 fs->fs_clean = FSACTIVE;
590 fs->fs_logbno = INT32_C(0);
591 ufs_sbwrite(ufsvfsp);
592 mutex_exit(&ufsvfsp->vfs_lock);
593 ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
594 if (ufsvfsp->vfs_bufp->b_flags & B_ERROR) {
595 error = EIO;
596 fs->fs_clean = clean;
597 fs->fs_logbno = logbno;
598 goto errout;
599 }
600
601 /*
602 * fetch the allocation block
603 * superblock -> one block of extents -> log data
604 */
605 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, logbtodb(fs, logbno),
606 fs->fs_bsize);
607 if (bp->b_flags & B_ERROR) {
608 error = EIO;
609 goto errout;
610 }
611
612 /*
613 * Free up the allocated space (dummy inode needed for free())
614 */
615 ip = ufs_alloc_inode(ufsvfsp, UFSROOTINO);
616 ebp = (void *)bp->b_un.b_addr;
617 for (i = 0, ep = &ebp->extents[0]; i < ebp->nextents; ++i, ++ep) {
618 fno = logbtofrag(fs, ep->pbno);
619 nfno = dbtofsb(fs, ep->nbno);
620 for (j = 0; j < nfno; j += fs->fs_frag, fno += fs->fs_frag)
621 free(ip, fno, fs->fs_bsize, 0);
622 }
623 free(ip, logbtofrag(fs, logbno), fs->fs_bsize, 0);
624 brelse(bp);
625 bp = NULL;
626
627 /*
628 * Push the metadata dirtied during the allocations
629 */
630 ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
631 sbupdate(ufsvfsp->vfs_vfs);
632 ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
633 bflush(ufsvfsp->vfs_dev);
634 error = bfinval(ufsvfsp->vfs_dev, 0);
635 if (error)
636 goto errout;
637
638 /*
639 * Free the dummy inode
640 */
641 ufs_free_inode(ip);
642
643 return (0);
644
645 errout:
646 /*
647 * Free up all resources
648 */
649 if (bp)
650 brelse(bp);
651 if (ip)
652 ufs_free_inode(ip);
653 return (error);
654 }
655
656 /*
657 * Allocate log space
658 * Assumes the file system is write locked and is not logging
659 */
660 static int
lufs_alloc(struct ufsvfs * ufsvfsp,struct fiolog * flp,size_t minb,cred_t * cr)661 lufs_alloc(struct ufsvfs *ufsvfsp, struct fiolog *flp, size_t minb, cred_t *cr)
662 {
663 int error = 0;
664 buf_t *bp = NULL;
665 extent_t *ep, *nep;
666 extent_block_t *ebp;
667 struct fs *fs = ufsvfsp->vfs_fs;
668 daddr_t fno; /* in frags */
669 daddr_t bno; /* in disk blocks */
670 int32_t logbno = INT32_C(0); /* will be fs_logbno */
671 struct inode *ip = NULL;
672 size_t nb = flp->nbytes_actual;
673 size_t tb = 0;
674
675 /*
676 * Mark the file system as FSACTIVE
677 */
678 ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
679 mutex_enter(&ufsvfsp->vfs_lock);
680 fs->fs_clean = FSACTIVE;
681 ufs_sbwrite(ufsvfsp);
682 mutex_exit(&ufsvfsp->vfs_lock);
683 ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
684
685 /*
686 * Allocate the allocation block (need dummy shadow inode;
687 * we use a shadow inode so the quota sub-system ignores
688 * the block allocations.)
689 * superblock -> one block of extents -> log data
690 */
691 ip = ufs_alloc_inode(ufsvfsp, UFSROOTINO);
692 ip->i_mode = IFSHAD; /* make the dummy a shadow inode */
693 rw_enter(&ip->i_contents, RW_WRITER);
694 fno = contigpref(ufsvfsp, nb + fs->fs_bsize, minb);
695 error = alloc(ip, fno, fs->fs_bsize, &fno, cr);
696 if (error)
697 goto errout;
698 bno = fsbtodb(fs, fno);
699
700 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno, fs->fs_bsize);
701 if (bp->b_flags & B_ERROR) {
702 error = EIO;
703 goto errout;
704 }
705
706 ebp = (void *)bp->b_un.b_addr;
707 ebp->type = LUFS_EXTENTS;
708 ebp->nextbno = UINT32_C(0);
709 ebp->nextents = UINT32_C(0);
710 ebp->chksum = INT32_C(0);
711 if (fs->fs_magic == FS_MAGIC)
712 logbno = bno;
713 else
714 logbno = dbtofsb(fs, bno);
715
716 /*
717 * Initialize the first extent
718 */
719 ep = &ebp->extents[0];
720 error = alloc(ip, fno + fs->fs_frag, fs->fs_bsize, &fno, cr);
721 if (error)
722 goto errout;
723 bno = fsbtodb(fs, fno);
724
725 ep->lbno = UINT32_C(0);
726 if (fs->fs_magic == FS_MAGIC)
727 ep->pbno = (uint32_t)bno;
728 else
729 ep->pbno = (uint32_t)fno;
730 ep->nbno = (uint32_t)fsbtodb(fs, fs->fs_frag);
731 ebp->nextents = UINT32_C(1);
732 tb = fs->fs_bsize;
733 nb -= fs->fs_bsize;
734
735 while (nb) {
736 error = alloc(ip, fno + fs->fs_frag, fs->fs_bsize, &fno, cr);
737 if (error) {
738 if (tb < minb)
739 goto errout;
740 error = 0;
741 break;
742 }
743 bno = fsbtodb(fs, fno);
744 if ((daddr_t)((logbtodb(fs, ep->pbno) + ep->nbno) == bno))
745 ep->nbno += (uint32_t)(fsbtodb(fs, fs->fs_frag));
746 else {
747 nep = ep + 1;
748 if ((caddr_t)(nep + 1) >
749 (bp->b_un.b_addr + fs->fs_bsize)) {
750 free(ip, fno, fs->fs_bsize, 0);
751 break;
752 }
753 nep->lbno = ep->lbno + ep->nbno;
754 if (fs->fs_magic == FS_MAGIC)
755 nep->pbno = (uint32_t)bno;
756 else
757 nep->pbno = (uint32_t)fno;
758 nep->nbno = (uint32_t)(fsbtodb(fs, fs->fs_frag));
759 ebp->nextents++;
760 ep = nep;
761 }
762 tb += fs->fs_bsize;
763 nb -= fs->fs_bsize;
764 }
765
766 if (tb < minb) { /* Failed to reach minimum log size */
767 error = ENOSPC;
768 goto errout;
769 }
770
771 ebp->nbytes = (uint32_t)tb;
772 setsum(&ebp->chksum, (int32_t *)bp->b_un.b_addr, fs->fs_bsize);
773 UFS_BWRITE2(ufsvfsp, bp);
774 if (bp->b_flags & B_ERROR) {
775 error = EIO;
776 goto errout;
777 }
778 /*
779 * Initialize the first two sectors of the log
780 */
781 error = lufs_initialize(ufsvfsp, logbtodb(fs, ebp->extents[0].pbno),
782 tb, flp);
783 if (error)
784 goto errout;
785
786 /*
787 * We are done initializing the allocation block and the log
788 */
789 brelse(bp);
790 bp = NULL;
791
792 /*
793 * Update the superblock and push the dirty metadata
794 */
795 ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
796 sbupdate(ufsvfsp->vfs_vfs);
797 ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
798 bflush(ufsvfsp->vfs_dev);
799 error = bfinval(ufsvfsp->vfs_dev, 1);
800 if (error)
801 goto errout;
802 if (ufsvfsp->vfs_bufp->b_flags & B_ERROR) {
803 error = EIO;
804 goto errout;
805 }
806
807 /*
808 * Everything is safely on disk; update log space pointer in sb
809 */
810 ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
811 mutex_enter(&ufsvfsp->vfs_lock);
812 fs->fs_logbno = (uint32_t)logbno;
813 ufs_sbwrite(ufsvfsp);
814 mutex_exit(&ufsvfsp->vfs_lock);
815 ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
816
817 /*
818 * Free the dummy inode
819 */
820 rw_exit(&ip->i_contents);
821 ufs_free_inode(ip);
822
823 /* inform user of real log size */
824 flp->nbytes_actual = tb;
825 return (0);
826
827 errout:
828 /*
829 * Free all resources
830 */
831 if (bp)
832 brelse(bp);
833 if (logbno) {
834 fs->fs_logbno = logbno;
835 (void) lufs_free(ufsvfsp);
836 }
837 if (ip) {
838 rw_exit(&ip->i_contents);
839 ufs_free_inode(ip);
840 }
841 return (error);
842 }
843
844 /*
845 * Disable logging
846 */
847 int
lufs_disable(vnode_t * vp,struct fiolog * flp)848 lufs_disable(vnode_t *vp, struct fiolog *flp)
849 {
850 int error = 0;
851 inode_t *ip = VTOI(vp);
852 ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
853 struct fs *fs = ufsvfsp->vfs_fs;
854 struct lockfs lf;
855 struct ulockfs *ulp;
856
857 flp->error = FIOLOG_ENONE;
858
859 /*
860 * Logging is already disabled; done
861 */
862 if (fs->fs_logbno == 0 || ufsvfsp->vfs_log == NULL)
863 return (0);
864
865 /*
866 * Readonly file system
867 */
868 if (fs->fs_ronly) {
869 flp->error = FIOLOG_EROFS;
870 return (0);
871 }
872
873 /*
874 * File system must be write locked to disable logging
875 */
876 error = ufs_fiolfss(vp, &lf);
877 if (error) {
878 return (error);
879 }
880 if (!LOCKFS_IS_ULOCK(&lf)) {
881 flp->error = FIOLOG_EULOCK;
882 return (0);
883 }
884 lf.lf_lock = LOCKFS_WLOCK;
885 lf.lf_flags = 0;
886 lf.lf_comment = NULL;
887 error = ufs_fiolfs(vp, &lf, 1);
888 if (error) {
889 flp->error = FIOLOG_EWLOCK;
890 return (0);
891 }
892
893 if (ufsvfsp->vfs_log == NULL || fs->fs_logbno == 0)
894 goto errout;
895
896 /*
897 * WE ARE COMMITTED TO DISABLING LOGGING PAST THIS POINT
898 */
899
900 /*
901 * Disable logging:
902 * Suspend the reclaim thread and force the delete thread to exit.
903 * When a nologging mount has completed there may still be
904 * work for reclaim to do so just suspend this thread until
905 * it's [deadlock-] safe for it to continue. The delete
906 * thread won't be needed as ufs_iinactive() calls
907 * ufs_delete() when logging is disabled.
908 * Freeze and drain reader ops.
909 * Commit any outstanding reader transactions (ufs_flush).
910 * Set the ``unmounted'' bit in the ufstrans struct.
911 * If debug, remove metadata from matamap.
912 * Disable matamap processing.
913 * NULL the trans ops table.
914 * Free all of the incore structs related to logging.
915 * Allow reader ops.
916 */
917 ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
918 ufs_thread_exit(&ufsvfsp->vfs_delete);
919
920 vfs_lock_wait(ufsvfsp->vfs_vfs);
921 ulp = &ufsvfsp->vfs_ulockfs;
922 mutex_enter(&ulp->ul_lock);
923 atomic_inc_ulong(&ufs_quiesce_pend);
924 (void) ufs_quiesce(ulp);
925
926 (void) ufs_flush(ufsvfsp->vfs_vfs);
927
928 TRANS_MATA_UMOUNT(ufsvfsp);
929 ufsvfsp->vfs_domatamap = 0;
930
931 /*
932 * Free all of the incore structs
933 * Aquire the ufs_scan_lock before de-linking the mtm data
934 * structure so that we keep ufs_sync() and ufs_update() away
935 * when they execute the ufs_scan_inodes() run while we're in
936 * progress of enabling/disabling logging.
937 */
938 mutex_enter(&ufs_scan_lock);
939 (void) lufs_unsnarf(ufsvfsp);
940 mutex_exit(&ufs_scan_lock);
941
942 atomic_dec_ulong(&ufs_quiesce_pend);
943 mutex_exit(&ulp->ul_lock);
944 vfs_setmntopt(ufsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0);
945 vfs_unlock(ufsvfsp->vfs_vfs);
946
947 fs->fs_rolled = FS_ALL_ROLLED;
948 ufsvfsp->vfs_nolog_si = 0;
949
950 /*
951 * Free the log space and mark the superblock as FSACTIVE
952 */
953 (void) lufs_free(ufsvfsp);
954
955 /*
956 * Allow the reclaim thread to continue.
957 */
958 ufs_thread_continue(&ufsvfsp->vfs_reclaim);
959
960 /*
961 * Unlock the file system
962 */
963 lf.lf_lock = LOCKFS_ULOCK;
964 lf.lf_flags = 0;
965 error = ufs_fiolfs(vp, &lf, 1);
966 if (error)
967 flp->error = FIOLOG_ENOULOCK;
968
969 return (0);
970
971 errout:
972 lf.lf_lock = LOCKFS_ULOCK;
973 lf.lf_flags = 0;
974 (void) ufs_fiolfs(vp, &lf, 1);
975 return (error);
976 }
977
978 /*
979 * Enable logging
980 */
981 int
lufs_enable(struct vnode * vp,struct fiolog * flp,cred_t * cr)982 lufs_enable(struct vnode *vp, struct fiolog *flp, cred_t *cr)
983 {
984 int error;
985 int reclaim;
986 inode_t *ip = VTOI(vp);
987 ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
988 struct fs *fs;
989 ml_unit_t *ul;
990 struct lockfs lf;
991 struct ulockfs *ulp;
992 vfs_t *vfsp = ufsvfsp->vfs_vfs;
993 uint64_t tmp_nbytes_actual;
994 uint64_t cg_minlogsize;
995 uint32_t cgsize;
996 static int minlogsizewarn = 0;
997 static int maxlogsizewarn = 0;
998
999 /*
1000 * Check if logging is already enabled
1001 */
1002 if (ufsvfsp->vfs_log) {
1003 flp->error = FIOLOG_ETRANS;
1004 /* for root ensure logging option is set */
1005 vfs_setmntopt(vfsp, MNTOPT_LOGGING, NULL, 0);
1006 return (0);
1007 }
1008 fs = ufsvfsp->vfs_fs;
1009
1010 /*
1011 * Come back here to recheck if we had to disable the log.
1012 */
1013 recheck:
1014 error = 0;
1015 reclaim = 0;
1016 flp->error = FIOLOG_ENONE;
1017
1018 /*
1019 * The size of the ufs log is determined using the following rules:
1020 *
1021 * 1) If no size is requested the log size is calculated as a
1022 * ratio of the total file system size. By default this is
1023 * 1MB of log per 1GB of file system. This calculation is then
1024 * capped at the log size specified by ldl_softlogcap.
1025 * 2) The log size requested may then be increased based on the
1026 * number of cylinder groups contained in the file system.
1027 * To prevent a hang the log has to be large enough to contain a
1028 * single transaction that alters every cylinder group in the file
1029 * system. This is calculated as cg_minlogsize.
1030 * 3) Finally a check is made that the log size requested is within
1031 * the limits of ldl_minlogsize and ldl_maxlogsize.
1032 */
1033
1034 /*
1035 * Adjust requested log size
1036 */
1037 flp->nbytes_actual = flp->nbytes_requested;
1038 if (flp->nbytes_actual == 0) {
1039 tmp_nbytes_actual =
1040 (((uint64_t)fs->fs_size) / ldl_divisor) << fs->fs_fshift;
1041 flp->nbytes_actual = (uint_t)MIN(tmp_nbytes_actual, INT_MAX);
1042 /*
1043 * The 1MB per 1GB log size allocation only applies up to
1044 * ldl_softlogcap size of log.
1045 */
1046 flp->nbytes_actual = MIN(flp->nbytes_actual, ldl_softlogcap);
1047 }
1048
1049 cgsize = ldl_cgsizereq ? ldl_cgsizereq : LDL_CGSIZEREQ(fs);
1050
1051 /*
1052 * Determine the log size required based on the number of cylinder
1053 * groups in the file system. The log has to be at least this size
1054 * to prevent possible hangs due to log space exhaustion.
1055 */
1056 cg_minlogsize = cgsize * fs->fs_ncg;
1057
1058 /*
1059 * Ensure that the minimum log size isn't so small that it could lead
1060 * to a full log hang.
1061 */
1062 if (ldl_minlogsize < LDL_MINLOGSIZE) {
1063 ldl_minlogsize = LDL_MINLOGSIZE;
1064 if (!minlogsizewarn) {
1065 cmn_err(CE_WARN, "ldl_minlogsize too small, increasing "
1066 "to 0x%x", LDL_MINLOGSIZE);
1067 minlogsizewarn = 1;
1068 }
1069 }
1070
1071 /*
1072 * Ensure that the maximum log size isn't greater than INT_MAX as the
1073 * logical log offset fields would overflow.
1074 */
1075 if (ldl_maxlogsize > INT_MAX) {
1076 ldl_maxlogsize = INT_MAX;
1077 if (!maxlogsizewarn) {
1078 cmn_err(CE_WARN, "ldl_maxlogsize too large, reducing "
1079 "to 0x%x", INT_MAX);
1080 maxlogsizewarn = 1;
1081 }
1082 }
1083
1084 if (cg_minlogsize > ldl_maxlogsize) {
1085 cmn_err(CE_WARN,
1086 "%s: reducing calculated log size from 0x%x to "
1087 "ldl_maxlogsize (0x%x).", fs->fs_fsmnt, (int)cg_minlogsize,
1088 ldl_maxlogsize);
1089 }
1090
1091 cg_minlogsize = MAX(cg_minlogsize, ldl_minlogsize);
1092 cg_minlogsize = MIN(cg_minlogsize, ldl_maxlogsize);
1093
1094 flp->nbytes_actual = MAX(flp->nbytes_actual, cg_minlogsize);
1095 flp->nbytes_actual = MAX(flp->nbytes_actual, ldl_minlogsize);
1096 flp->nbytes_actual = MIN(flp->nbytes_actual, ldl_maxlogsize);
1097 flp->nbytes_actual = blkroundup(fs, flp->nbytes_actual);
1098
1099 /*
1100 * logging is enabled and the log is the right size; done
1101 */
1102 ul = ufsvfsp->vfs_log;
1103 if (ul && fs->fs_logbno && (flp->nbytes_actual == ul->un_requestsize))
1104 return (0);
1105
1106 /*
1107 * Readonly file system
1108 */
1109 if (fs->fs_ronly) {
1110 flp->error = FIOLOG_EROFS;
1111 return (0);
1112 }
1113
1114 /*
1115 * File system must be write locked to enable logging
1116 */
1117 error = ufs_fiolfss(vp, &lf);
1118 if (error) {
1119 return (error);
1120 }
1121 if (!LOCKFS_IS_ULOCK(&lf)) {
1122 flp->error = FIOLOG_EULOCK;
1123 return (0);
1124 }
1125 lf.lf_lock = LOCKFS_WLOCK;
1126 lf.lf_flags = 0;
1127 lf.lf_comment = NULL;
1128 error = ufs_fiolfs(vp, &lf, 1);
1129 if (error) {
1130 flp->error = FIOLOG_EWLOCK;
1131 return (0);
1132 }
1133
1134 /*
1135 * Grab appropriate locks to synchronize with the rest
1136 * of the system
1137 */
1138 vfs_lock_wait(vfsp);
1139 ulp = &ufsvfsp->vfs_ulockfs;
1140 mutex_enter(&ulp->ul_lock);
1141
1142 /*
1143 * File system must be fairly consistent to enable logging
1144 */
1145 if (fs->fs_clean != FSLOG &&
1146 fs->fs_clean != FSACTIVE &&
1147 fs->fs_clean != FSSTABLE &&
1148 fs->fs_clean != FSCLEAN) {
1149 flp->error = FIOLOG_ECLEAN;
1150 goto unlockout;
1151 }
1152
1153 /*
1154 * A write-locked file system is only active if there are
1155 * open deleted files; so remember to set FS_RECLAIM later.
1156 */
1157 if (fs->fs_clean == FSACTIVE)
1158 reclaim = FS_RECLAIM;
1159
1160 /*
1161 * Logging is already enabled; must be changing the log's size
1162 */
1163 if (fs->fs_logbno && ufsvfsp->vfs_log) {
1164 /*
1165 * Before we can disable logging, we must give up our
1166 * lock. As a consequence of unlocking and disabling the
1167 * log, the fs structure may change. Because of this, when
1168 * disabling is complete, we will go back to recheck to
1169 * repeat all of the checks that we performed to get to
1170 * this point. Disabling sets fs->fs_logbno to 0, so this
1171 * will not put us into an infinite loop.
1172 */
1173 mutex_exit(&ulp->ul_lock);
1174 vfs_unlock(vfsp);
1175
1176 lf.lf_lock = LOCKFS_ULOCK;
1177 lf.lf_flags = 0;
1178 error = ufs_fiolfs(vp, &lf, 1);
1179 if (error) {
1180 flp->error = FIOLOG_ENOULOCK;
1181 return (0);
1182 }
1183 error = lufs_disable(vp, flp);
1184 if (error || (flp->error != FIOLOG_ENONE))
1185 return (0);
1186 goto recheck;
1187 }
1188
1189 error = lufs_alloc(ufsvfsp, flp, cg_minlogsize, cr);
1190 if (error)
1191 goto errout;
1192
1193 /*
1194 * Create all of the incore structs
1195 */
1196 error = lufs_snarf(ufsvfsp, fs, 0);
1197 if (error)
1198 goto errout;
1199
1200 /*
1201 * DON'T ``GOTO ERROUT'' PAST THIS POINT
1202 */
1203
1204 /*
1205 * Pretend we were just mounted with logging enabled
1206 * Get the ops vector
1207 * If debug, record metadata locations with log subsystem
1208 * Start the delete thread
1209 * Start the reclaim thread, if necessary
1210 */
1211 vfs_setmntopt(vfsp, MNTOPT_LOGGING, NULL, 0);
1212
1213 TRANS_DOMATAMAP(ufsvfsp);
1214 TRANS_MATA_MOUNT(ufsvfsp);
1215 TRANS_MATA_SI(ufsvfsp, fs);
1216 ufs_thread_start(&ufsvfsp->vfs_delete, ufs_thread_delete, vfsp);
1217 if (fs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
1218 fs->fs_reclaim &= ~FS_RECLAIM;
1219 fs->fs_reclaim |= FS_RECLAIMING;
1220 ufs_thread_start(&ufsvfsp->vfs_reclaim,
1221 ufs_thread_reclaim, vfsp);
1222 } else
1223 fs->fs_reclaim |= reclaim;
1224
1225 mutex_exit(&ulp->ul_lock);
1226 vfs_unlock(vfsp);
1227
1228 /*
1229 * Unlock the file system
1230 */
1231 lf.lf_lock = LOCKFS_ULOCK;
1232 lf.lf_flags = 0;
1233 error = ufs_fiolfs(vp, &lf, 1);
1234 if (error) {
1235 flp->error = FIOLOG_ENOULOCK;
1236 return (0);
1237 }
1238
1239 /*
1240 * There's nothing in the log yet (we've just allocated it)
1241 * so directly write out the super block.
1242 * Note, we have to force this sb out to disk
1243 * (not just to the log) so that if we crash we know we are logging
1244 */
1245 mutex_enter(&ufsvfsp->vfs_lock);
1246 fs->fs_clean = FSLOG;
1247 fs->fs_rolled = FS_NEED_ROLL; /* Mark the fs as unrolled */
1248 UFS_BWRITE2(NULL, ufsvfsp->vfs_bufp);
1249 mutex_exit(&ufsvfsp->vfs_lock);
1250
1251 return (0);
1252
1253 errout:
1254 /*
1255 * Aquire the ufs_scan_lock before de-linking the mtm data
1256 * structure so that we keep ufs_sync() and ufs_update() away
1257 * when they execute the ufs_scan_inodes() run while we're in
1258 * progress of enabling/disabling logging.
1259 */
1260 mutex_enter(&ufs_scan_lock);
1261 (void) lufs_unsnarf(ufsvfsp);
1262 mutex_exit(&ufs_scan_lock);
1263
1264 (void) lufs_free(ufsvfsp);
1265 unlockout:
1266 mutex_exit(&ulp->ul_lock);
1267 vfs_unlock(vfsp);
1268
1269 lf.lf_lock = LOCKFS_ULOCK;
1270 lf.lf_flags = 0;
1271 (void) ufs_fiolfs(vp, &lf, 1);
1272 return (error);
1273 }
1274
/*
 * Read strategy for a logging UFS.
 *
 * A read may be satisfied partly (or wholly) by deltas still sitting in
 * the log rather than on the master device.  We look up all deltas that
 * overlap the requested range; if none exist the read goes straight to
 * the master device.  Otherwise the master is (optionally) read first to
 * fill the gaps, and then the overlapping delta data is read back out of
 * the log on top of it.
 *
 *	ul	in-core log unit for this file system
 *	bp	buffer describing the read (b_blkno/b_bcount in device blocks)
 *
 * Completion is signalled via biodone(); errors are returned in bp
 * (B_ERROR/b_error = EIO).
 */
void
lufs_read_strategy(ml_unit_t *ul, buf_t *bp)
{
	mt_map_t	*logmap	= ul->un_logmap;
	offset_t	mof	= ldbtob(bp->b_blkno);	/* master offset, bytes */
	off_t		nb	= bp->b_bcount;		/* transfer length */
	mapentry_t	*age;		/* list of overlapping deltas */
	char		*va;		/* kernel mapping of bp's data */
	int		(*saviodone)();	/* saved b_iodone across sync read */
	int		entire_range;	/* deltas cover the whole request? */

	/*
	 * get a linked list of overlapping deltas
	 * returns with &mtm->mtm_rwlock held
	 */
	entire_range = logmap_list_get(logmap, mof, nb, &age);

	/*
	 * no overlapping deltas were found; read master
	 */
	if (age == NULL) {
		/* drop the lock logmap_list_get() left held */
		rw_exit(&logmap->mtm_rwlock);
		if (ul->un_flags & LDL_ERROR) {
			/* log device has failed; fail the I/O */
			bp->b_flags |= B_ERROR;
			bp->b_error = EIO;
			biodone(bp);
		} else {
			ul->un_ufsvfs->vfs_iotstamp = ddi_get_lbolt();
			logstats.ls_lreads.value.ui64++;
			(void) bdev_strategy(bp);
			lwp_stat_update(LWP_STAT_INBLK, 1);
		}
		return;
	}

	va = bp_mapin_common(bp, VM_SLEEP);
	/*
	 * if necessary, sync read the data from master
	 * errors are returned in bp
	 */
	if (!entire_range) {
		/*
		 * Temporarily substitute trans_not_done so we can wait
		 * synchronously for the master read, then restore the
		 * caller's iodone routine.
		 */
		saviodone = bp->b_iodone;
		bp->b_iodone = trans_not_done;
		logstats.ls_mreads.value.ui64++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_INBLK, 1);
		if (trans_not_wait(bp))
			ldl_seterror(ul, "Error reading master");
		bp->b_iodone = saviodone;
	}

	/*
	 * sync read the data from the log
	 * errors are returned inline
	 */
	if (ldl_read(ul, va, mof, nb, age)) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}

	/*
	 * unlist the deltas (also releases mtm_rwlock)
	 */
	logmap_list_put(logmap, age);

	/*
	 * all done; report any log-device error state to the caller
	 */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	biodone(bp);
}
1349
/*
 * Write strategy for a logging UFS.
 *
 * If the buffer overlaps outstanding deltas in the deltamap, the delta
 * data is moved into the logmap (i.e. the write is captured by the log)
 * and the buffer is completed immediately without touching the master
 * device.  Otherwise the write is passed through to the underlying
 * device, or the snapshot driver when a snapshot is active.
 *
 *	ul	in-core log unit for this file system
 *	bp	buffer describing the write
 *
 * Completion is signalled via biodone() on the logged path; on the
 * pass-through path the device strategy routine completes the buffer.
 */
void
lufs_write_strategy(ml_unit_t *ul, buf_t *bp)
{
	offset_t	mof	= ldbtob(bp->b_blkno);	/* master offset, bytes */
	off_t		nb	= bp->b_count;
	char		*va;
	mapentry_t	*me;

	/* writes must be in whole device blocks */
	ASSERT((nb & DEV_BMASK) == 0);
	/* note activity so the log map is considered referenced */
	ul->un_logmap->mtm_ref = 1;

	/*
	 * if there are deltas, move into log
	 */
	me = deltamap_remove(ul->un_deltamap, mof, nb);
	if (me) {

		va = bp_mapin_common(bp, VM_SLEEP);

		ASSERT(((ul->un_debug & MT_WRITE_CHECK) == 0) ||
		    (ul->un_matamap == NULL)||
		    matamap_within(ul->un_matamap, mof, nb));

		/*
		 * move to logmap; the cached-roll-buffer path also hands
		 * the log a copy of the user data for later rolling
		 */
		if (ufs_crb_enable) {
			logmap_add_buf(ul, va, mof, me,
			    bp->b_un.b_addr, nb);
		} else {
			logmap_add(ul, va, mof, me);
		}

		if (ul->un_flags & LDL_ERROR) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EIO;
		}
		biodone(bp);
		return;
	}
	/* log device in error state: fail the write without issuing it */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}

	/*
	 * Check that we are not updating metadata, or if so then via B_PHYS.
	 */
	ASSERT((ul->un_matamap == NULL) ||
	    !(matamap_overlap(ul->un_matamap, mof, nb) &&
	    ((bp->b_flags & B_PHYS) == 0)));

	ul->un_ufsvfs->vfs_iotstamp = ddi_get_lbolt();
	logstats.ls_lwrites.value.ui64++;

	/* If snapshots are enabled, write through the snapshot driver */
	if (ul->un_ufsvfs->vfs_snapshot)
		fssnap_strategy(&ul->un_ufsvfs->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	lwp_stat_update(LWP_STAT_OUBLK, 1);
}
1415
1416 void
lufs_strategy(ml_unit_t * ul,buf_t * bp)1417 lufs_strategy(ml_unit_t *ul, buf_t *bp)
1418 {
1419 if (bp->b_flags & B_READ)
1420 lufs_read_strategy(ul, bp);
1421 else
1422 lufs_write_strategy(ul, bp);
1423 }
1424
1425 /* ARGSUSED */
1426 static int
delta_stats_update(kstat_t * ksp,int rw)1427 delta_stats_update(kstat_t *ksp, int rw)
1428 {
1429 if (rw == KSTAT_WRITE) {
1430 delta_stats[DT_SB] = dkstats.ds_superblock_deltas.value.ui64;
1431 delta_stats[DT_CG] = dkstats.ds_bitmap_deltas.value.ui64;
1432 delta_stats[DT_SI] = dkstats.ds_suminfo_deltas.value.ui64;
1433 delta_stats[DT_AB] = dkstats.ds_allocblk_deltas.value.ui64;
1434 delta_stats[DT_ABZERO] = dkstats.ds_ab0_deltas.value.ui64;
1435 delta_stats[DT_DIR] = dkstats.ds_dir_deltas.value.ui64;
1436 delta_stats[DT_INODE] = dkstats.ds_inode_deltas.value.ui64;
1437 delta_stats[DT_FBI] = dkstats.ds_fbiwrite_deltas.value.ui64;
1438 delta_stats[DT_QR] = dkstats.ds_quota_deltas.value.ui64;
1439 delta_stats[DT_SHAD] = dkstats.ds_shadow_deltas.value.ui64;
1440
1441 roll_stats[DT_SB] = dkstats.ds_superblock_rolled.value.ui64;
1442 roll_stats[DT_CG] = dkstats.ds_bitmap_rolled.value.ui64;
1443 roll_stats[DT_SI] = dkstats.ds_suminfo_rolled.value.ui64;
1444 roll_stats[DT_AB] = dkstats.ds_allocblk_rolled.value.ui64;
1445 roll_stats[DT_ABZERO] = dkstats.ds_ab0_rolled.value.ui64;
1446 roll_stats[DT_DIR] = dkstats.ds_dir_rolled.value.ui64;
1447 roll_stats[DT_INODE] = dkstats.ds_inode_rolled.value.ui64;
1448 roll_stats[DT_FBI] = dkstats.ds_fbiwrite_rolled.value.ui64;
1449 roll_stats[DT_QR] = dkstats.ds_quota_rolled.value.ui64;
1450 roll_stats[DT_SHAD] = dkstats.ds_shadow_rolled.value.ui64;
1451 } else {
1452 dkstats.ds_superblock_deltas.value.ui64 = delta_stats[DT_SB];
1453 dkstats.ds_bitmap_deltas.value.ui64 = delta_stats[DT_CG];
1454 dkstats.ds_suminfo_deltas.value.ui64 = delta_stats[DT_SI];
1455 dkstats.ds_allocblk_deltas.value.ui64 = delta_stats[DT_AB];
1456 dkstats.ds_ab0_deltas.value.ui64 = delta_stats[DT_ABZERO];
1457 dkstats.ds_dir_deltas.value.ui64 = delta_stats[DT_DIR];
1458 dkstats.ds_inode_deltas.value.ui64 = delta_stats[DT_INODE];
1459 dkstats.ds_fbiwrite_deltas.value.ui64 = delta_stats[DT_FBI];
1460 dkstats.ds_quota_deltas.value.ui64 = delta_stats[DT_QR];
1461 dkstats.ds_shadow_deltas.value.ui64 = delta_stats[DT_SHAD];
1462
1463 dkstats.ds_superblock_rolled.value.ui64 = roll_stats[DT_SB];
1464 dkstats.ds_bitmap_rolled.value.ui64 = roll_stats[DT_CG];
1465 dkstats.ds_suminfo_rolled.value.ui64 = roll_stats[DT_SI];
1466 dkstats.ds_allocblk_rolled.value.ui64 = roll_stats[DT_AB];
1467 dkstats.ds_ab0_rolled.value.ui64 = roll_stats[DT_ABZERO];
1468 dkstats.ds_dir_rolled.value.ui64 = roll_stats[DT_DIR];
1469 dkstats.ds_inode_rolled.value.ui64 = roll_stats[DT_INODE];
1470 dkstats.ds_fbiwrite_rolled.value.ui64 = roll_stats[DT_FBI];
1471 dkstats.ds_quota_rolled.value.ui64 = roll_stats[DT_QR];
1472 dkstats.ds_shadow_rolled.value.ui64 = roll_stats[DT_SHAD];
1473 }
1474 return (0);
1475 }
1476
1477 extern size_t ufs_crb_limit;
1478 extern int ufs_max_crb_divisor;
1479
1480 void
lufs_init(void)1481 lufs_init(void)
1482 {
1483 kstat_t *ksp;
1484
1485 /* Create kmem caches */
1486 lufs_sv = kmem_cache_create("lufs_save", sizeof (lufs_save_t), 0,
1487 NULL, NULL, NULL, NULL, NULL, 0);
1488 lufs_bp = kmem_cache_create("lufs_bufs", sizeof (lufs_buf_t), 0,
1489 NULL, NULL, NULL, NULL, NULL, 0);
1490
1491 mutex_init(&log_mutex, NULL, MUTEX_DEFAULT, NULL);
1492
1493 _init_top();
1494
1495 if (bio_lufs_strategy == NULL)
1496 bio_lufs_strategy = (void (*) (void *, buf_t *)) lufs_strategy;
1497
1498 /*
1499 * Initialise general logging and delta kstats
1500 */
1501 ksp = kstat_create("ufs_log", 0, "logstats", "ufs", KSTAT_TYPE_NAMED,
1502 sizeof (logstats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1503 if (ksp) {
1504 ksp->ks_data = (void *) &logstats;
1505 kstat_install(ksp);
1506 }
1507
1508 ksp = kstat_create("ufs_log", 0, "deltastats", "ufs", KSTAT_TYPE_NAMED,
1509 sizeof (dkstats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
1510 if (ksp) {
1511 ksp->ks_data = (void *) &dkstats;
1512 ksp->ks_update = delta_stats_update;
1513 kstat_install(ksp);
1514 }
1515
1516 /* Initialize generation of logging ids */
1517 lufs_genid_init();
1518
1519 /*
1520 * Set up the maximum amount of kmem that the crbs (system wide)
1521 * can use.
1522 */
1523 ufs_crb_limit = kmem_maxavail() / ufs_max_crb_divisor;
1524 }
1525