1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2012 Milan Jurik. All rights reserved.
25 */
26
27 #include <sys/systm.h>
28 #include <sys/types.h>
29 #include <sys/vnode.h>
30 #include <sys/errno.h>
31 #include <sys/sysmacros.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/conf.h>
35 #include <sys/proc.h>
36 #include <sys/cmn_err.h>
37 #include <sys/fs/ufs_inode.h>
38 #include <sys/fs/ufs_filio.h>
39 #include <sys/fs/ufs_log.h>
40 #include <sys/inttypes.h>
41 #include <sys/atomic.h>
42 #include <sys/tuneable.h>
43
44 /*
45 * externs
46 */
47 extern pri_t minclsyspri;
48 extern struct kmem_cache *lufs_bp;
49 extern int ufs_trans_push_quota(ufsvfs_t *, delta_t, struct dquot *);
50
51 /*
52 * globals
53 */
54 kmem_cache_t *mapentry_cache;
55
56 /*
57 * logmap tuning constants
58 */
59 long logmap_maxnme_commit = 2048;
60 long logmap_maxnme_async = 4096;
61 long logmap_maxnme_sync = 6144;
62 long logmap_maxcfrag_commit = 4; /* Max canceled fragments per moby */
63
64
65 uint64_t ufs_crb_size = 0; /* current size of all crb buffers */
66 uint64_t ufs_crb_max_size = 0; /* highest crb buffer use so far */
67 size_t ufs_crb_limit; /* max allowable size for crbs */
68 uint64_t ufs_crb_alloc_fails = 0; /* crb allocation failures stat */
69 #define UFS_MAX_CRB_DEFAULT_DIVISOR 10 /* max 1/10 kmem_maxavail() */
70 int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
71 void handle_dquot(mapentry_t *);
72
73 /*
74 * GENERIC MAP ROUTINES
75 */
76
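/*
 * CRB_FREE releases a cached roll buffer (crb): it frees the data
 * buffer, subtracts its size from the global ufs_crb_size count,
 * frees the crb structure itself and clears the map entry's pointer
 * to it.
 */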
77 #define CRB_FREE(crb, me) \
78 kmem_free(crb->c_buf, crb->c_nb); \
79 atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
80 kmem_free(crb, sizeof (crb_t)); \
81 (me)->me_crb = NULL;
82
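/*
 * CRB_RELE drops a map entry's reference on its cached roll buffer
 * and frees the buffer once the last reference goes away.
 */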
83 #define CRB_RELE(me) { \
84 crb_t *crb = (me)->me_crb; \
85 if (crb && (--crb->c_refcnt == 0)) { \
86 CRB_FREE(crb, me) \
87 } \
88 }
89
90 /*
91 * Check that the old delta has an argument and a push function of
92 * ufs_trans_push_quota(), then check that the old and new deltas differ.
93 * If so, we clean up with handle_dquot() before replacing the old delta.
94 */
95 #define HANDLE_DQUOT(me, melist) { \
96 if ((me->me_arg) && \
97 (me->me_func == ufs_trans_push_quota)) { \
98 if (!((me->me_dt == melist->me_dt) && \
99 (me->me_arg == melist->me_arg) && \
100 (me->me_func == melist->me_func))) { \
101 handle_dquot(me); \
102 } \
103 } \
104 }
105
106 /*
107 * free up all the mapentries for a map
108 */
109 void
110 map_free_entries(mt_map_t *mtm)
111 {
112 int i;
113 mapentry_t *me;
114
115 while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
116 me->me_next->me_prev = me->me_prev;
117 me->me_prev->me_next = me->me_next;
118 CRB_RELE(me);
119 kmem_cache_free(mapentry_cache, me);
120 }
121 for (i = 0; i < mtm->mtm_nhash; i++)
122 mtm->mtm_hash[i] = NULL;
123 mtm->mtm_nme = 0;
124 mtm->mtm_nmet = 0;
125 }
126
127 /*
128 * done with map; free if necessary
129 */
130 mt_map_t *
131 map_put(mt_map_t *mtm)
132 {
133 /*
134 * free up the map's memory
135 */
136 map_free_entries(mtm);
137 ASSERT(map_put_debug(mtm));
138 kmem_free(mtm->mtm_hash,
139 (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
140 mutex_destroy(&mtm->mtm_mutex);
141 mutex_destroy(&mtm->mtm_scan_mutex);
142 cv_destroy(&mtm->mtm_to_roll_cv);
143 cv_destroy(&mtm->mtm_from_roll_cv);
144 rw_destroy(&mtm->mtm_rwlock);
145 mutex_destroy(&mtm->mtm_lock);
146 cv_destroy(&mtm->mtm_cv_commit);
147 cv_destroy(&mtm->mtm_cv_next);
148 cv_destroy(&mtm->mtm_cv_eot);
149 cv_destroy(&mtm->mtm_cv);
150 kmem_free(mtm, sizeof (mt_map_t));
151 return (NULL);
152 }
153 /*
154 * Allocate a map;
155 */
156 mt_map_t *
157 map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
158 {
159 mt_map_t *mtm;
160
161 /*
162 * assume the map is not here and allocate the necessary structs
163 */
164 mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
165 mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
166 mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
167 cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
168 cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
169 rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
170 mtm->mtm_next = (mapentry_t *)mtm;
171 mtm->mtm_prev = (mapentry_t *)mtm;
172 mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
173 KM_SLEEP);
174 mtm->mtm_nhash = nh;
175 mtm->mtm_debug = ul->un_debug;
176 mtm->mtm_type = maptype;
177
178 mtm->mtm_cfrags = 0;
179 mtm->mtm_cfragmax = logmap_maxcfrag_commit;
180
181 /*
182 * for scan test
183 */
184 mtm->mtm_ul = ul;
185
186 /*
187 * Initialize locks
188 */
189 mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
190 cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
191 cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
192 cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
193 cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
194 ASSERT(map_get_debug(ul, mtm));
195
196 return (mtm);
197 }
198
199 /*
200 * DELTAMAP ROUTINES
201 */
202 /*
203 * deltamap tuning constants
204 */
205 long deltamap_maxnme = 1024; /* global so it can be set */
206
207 int
208 deltamap_need_commit(mt_map_t *mtm)
209 {
210 return (mtm->mtm_nme > deltamap_maxnme);
211 }
212
213 /*
214 * put a delta into a deltamap; may sleep on memory
215 */
216 void
217 deltamap_add(
218 mt_map_t *mtm,
219 offset_t mof,
220 off_t nb,
221 delta_t dtyp,
222 int (*func)(),
223 ulong_t arg,
224 threadtrans_t *tp)
225 {
226 int32_t hnb;
227 mapentry_t *me;
228 mapentry_t **mep;
229
230 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
231 map_check_linkage(mtm));
232
233 mutex_enter(&mtm->mtm_mutex);
234
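	/*
	 * Process the delta in MAPBLOCKSIZE chunks: hnb is the number of
	 * bytes from mof to the end of the current map block, clamped to
	 * the bytes remaining, so a delta that straddles a map block
	 * boundary is split into one mapentry per map block.
	 */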
235 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
236 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
237 if (hnb > nb)
238 hnb = nb;
239 /*
240 * Search for dup entry. We need to ensure that we don't
241 * replace a map entry which carries quota information
242 * with a map entry which doesn't. In that case we lose
243 * the reference to the dquot structure, which will not be
244 * cleaned up by the push function me->me_func as this will
245 * never be called.
246 * The stray dquot would be found later by invalidatedq()
247 * causing a panic when the filesystem is unmounted.
248 */
249 mep = MAP_HASH(mof, mtm);
250 for (me = *mep; me; me = me->me_hash) {
251 if (DATAwithinME(mof, hnb, me)) {
252 /*
253 * Don't remove quota entries which have
254 * incremented the ref count (those with a
255 * ufs_trans_push_quota push function).
256 * Let logmap_add[_buf] clean them up.
257 */
258 if (me->me_func == ufs_trans_push_quota) {
259 continue;
260 }
261 break;
262 }
263 ASSERT((dtyp == DT_CANCEL) ||
264 (!DATAoverlapME(mof, hnb, me)) ||
265 MEwithinDATA(me, mof, hnb));
266 }
267
268 if (me) {
269 /* already in map */
270 continue;
271 }
272
273 /*
274 * Add up all the delta map deltas so we can compute
275 * an upper bound on the log size used.
276 * Note, some deltas get removed from the deltamap
277 * before the deltamap_push by lufs_write_strategy
278 * and so multiple deltas to the same mof offset
279 * don't get cancelled here but in the logmap.
280 * Thus we can't easily get an accurate count of
281 * the log space used - only an upper bound.
282 */
283 if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
284 ASSERT(dtyp != DT_CANCEL);
285 if (dtyp == DT_ABZERO) {
286 tp->deltas_size += sizeof (struct delta);
287 } else {
288 tp->deltas_size +=
289 (hnb + sizeof (struct delta));
290 }
291 }
292
293 delta_stats[dtyp]++;
294
295 /*
296 * get a mapentry
297 * May need to drop & re-grab the mtm_mutex
298 * and then recheck for a duplicate
299 */
300 me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
301 if (me == NULL) {
302 mutex_exit(&mtm->mtm_mutex);
303 me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
304 mutex_enter(&mtm->mtm_mutex);
305 }
306 bzero(me, sizeof (mapentry_t));
307
308 /*
309 * initialize and put in deltamap
310 */
311 me->me_mof = mof;
312 me->me_nb = hnb;
313 me->me_func = func;
314 me->me_arg = arg;
315 me->me_dt = dtyp;
316 me->me_flags = ME_HASH;
317 me->me_tid = mtm->mtm_tid;
318
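		/* link into the hash chain and onto the tail of the map's circular list */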
319 me->me_hash = *mep;
320 *mep = me;
321 me->me_next = (mapentry_t *)mtm;
322 me->me_prev = mtm->mtm_prev;
323 mtm->mtm_prev->me_next = me;
324 mtm->mtm_prev = me;
325 mtm->mtm_nme++;
326 }
327 mutex_exit(&mtm->mtm_mutex);
328
329 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
330 map_check_linkage(mtm));
331 }
332
333 /*
334 * remove deltas within (mof, nb) and return as linked list
335 */
336 mapentry_t *
337 deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
338 {
339 off_t hnb;
340 mapentry_t *me;
341 mapentry_t **mep;
342 mapentry_t *mer;
343
344 if (mtm == NULL)
345 return (NULL);
346
347 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
348 map_check_linkage(mtm));
349
350 mutex_enter(&mtm->mtm_mutex);
351 for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
352 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
353 if (hnb > nb)
354 hnb = nb;
355 /*
356 * remove entries from hash and return as an aged linked list
357 */
358 mep = MAP_HASH(mof, mtm);
359 while ((me = *mep) != 0) {
360 if (MEwithinDATA(me, mof, hnb)) {
361 *mep = me->me_hash;
362 me->me_next->me_prev = me->me_prev;
363 me->me_prev->me_next = me->me_next;
364 me->me_hash = mer;
365 mer = me;
366 me->me_flags |= ME_LIST;
367 me->me_flags &= ~ME_HASH;
368 mtm->mtm_nme--;
369 } else
370 mep = &me->me_hash;
371 }
372 }
373 mutex_exit(&mtm->mtm_mutex);
374
375 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
376 map_check_linkage(mtm));
377
378 return (mer);
379 }
380
381 /*
382 * delete entries within (mof, nb)
383 */
384 void
385 deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
386 {
387 mapentry_t *me;
388 mapentry_t *menext;
389
390 menext = deltamap_remove(mtm, mof, nb);
391 while ((me = menext) != 0) {
392 menext = me->me_hash;
393 kmem_cache_free(mapentry_cache, me);
394 }
395 }
396
397 /*
398 * Call the indicated function to cause deltas to move to the logmap.
399 * top_end_sync() is the only caller of this function and
400 * it has waited for the completion of all threads, so there can
401 * be no other activity in the deltamap. Therefore we don't need to
402 * hold the deltamap lock.
403 */
404 void
405 deltamap_push(ml_unit_t *ul)
406 {
407 delta_t dtyp;
408 int (*func)();
409 ulong_t arg;
410 mapentry_t *me;
411 offset_t mof;
412 off_t nb;
413 mt_map_t *mtm = ul->un_deltamap;
414
415 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
416 map_check_linkage(mtm));
417
418 /*
419 * for every entry in the deltamap
420 */
421 while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
422 ASSERT(me->me_func);
423 func = me->me_func;
424 dtyp = me->me_dt;
425 arg = me->me_arg;
426 mof = me->me_mof;
427 nb = me->me_nb;
428 if ((ul->un_flags & LDL_ERROR) ||
429 (*func)(ul->un_ufsvfs, dtyp, arg))
430 deltamap_del(mtm, mof, nb);
431 }
432
433 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
434 map_check_linkage(mtm));
435 }
436
437 /*
438 * LOGMAP ROUTINES
439 */
440
441 int
442 logmap_need_commit(mt_map_t *mtm)
443 {
444 return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
445 (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
446 }
447
448 int
449 logmap_need_roll_async(mt_map_t *mtm)
450 {
451 return (mtm->mtm_nme > logmap_maxnme_async);
452 }
453
454 int
455 logmap_need_roll_sync(mt_map_t *mtm)
456 {
457 return (mtm->mtm_nme > logmap_maxnme_sync);
458 }
459
460 void
461 logmap_start_roll(ml_unit_t *ul)
462 {
463 mt_map_t *logmap = ul->un_logmap;
464
465 logmap_settail(logmap, ul);
466 ASSERT(!(ul->un_flags & LDL_NOROLL));
467 mutex_enter(&logmap->mtm_mutex);
468 if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
469 logmap->mtm_flags |= MTM_ROLL_RUNNING;
470 logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
471 (void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
472 TS_RUN, minclsyspri);
473 }
474 mutex_exit(&logmap->mtm_mutex);
475 }
476
477 void
478 logmap_kill_roll(ml_unit_t *ul)
479 {
480 mt_map_t *mtm = ul->un_logmap;
481
482 if (mtm == NULL)
483 return;
484
485 mutex_enter(&mtm->mtm_mutex);
486
487 while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
488 mtm->mtm_flags |= MTM_ROLL_EXIT;
489 cv_signal(&mtm->mtm_to_roll_cv);
490 cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
491 }
492 mutex_exit(&mtm->mtm_mutex);
493 }
494
495 /*
496 * kick the roll thread if it's not doing anything
497 */
498 void
499 logmap_forceroll_nowait(mt_map_t *logmap)
500 {
501 /*
502 * Don't need to lock mtm_mutex to read mtm_flags here as we
503 * don't care in the rare case when we get a transitional value
504 * of mtm_flags. Just by signalling the thread, it will wake up
505 * and notice it has too many logmap entries.
506 */
507 ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
508 if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
509 cv_signal(&logmap->mtm_to_roll_cv);
510 }
511 }
512
513 /*
514 * kick the roll thread and wait for it to finish a cycle
515 */
516 void
517 logmap_forceroll(mt_map_t *mtm)
518 {
519 mutex_enter(&mtm->mtm_mutex);
520 if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
521 mtm->mtm_flags |= MTM_FORCE_ROLL;
522 cv_signal(&mtm->mtm_to_roll_cv);
523 }
524 do {
525 if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
526 mtm->mtm_flags &= ~MTM_FORCE_ROLL;
527 goto out;
528 }
529 cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
530 } while (mtm->mtm_flags & MTM_FORCE_ROLL);
531 out:
532 mutex_exit(&mtm->mtm_mutex);
533 }
534
535 /*
536 * remove rolled deltas within (mof, nb) and free them
537 */
538 void
539 logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
540 {
541 int dolock = 0;
542 off_t hnb;
543 mapentry_t *me;
544 mapentry_t **mep;
545 offset_t savmof = mof;
546 off_t savnb = nb;
547
548 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
549 map_check_linkage(mtm));
550
551 again:
552 if (dolock)
553 rw_enter(&mtm->mtm_rwlock, RW_WRITER);
554 mutex_enter(&mtm->mtm_mutex);
555 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
556 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
557 if (hnb > nb)
558 hnb = nb;
559 /*
560 * remove and free the rolled entries
561 */
562 mep = MAP_HASH(mof, mtm);
563 while ((me = *mep) != 0) {
564 if ((me->me_flags & ME_ROLL) &&
565 (MEwithinDATA(me, mof, hnb))) {
566 if (me->me_flags & ME_AGE) {
567 ASSERT(dolock == 0);
568 dolock = 1;
569 mutex_exit(&mtm->mtm_mutex);
570 mof = savmof;
571 nb = savnb;
572 goto again;
573 }
574 *mep = me->me_hash;
575 me->me_next->me_prev = me->me_prev;
576 me->me_prev->me_next = me->me_next;
577 me->me_flags &= ~(ME_HASH|ME_ROLL);
578 ASSERT(!(me->me_flags & ME_USER));
579 mtm->mtm_nme--;
580 /*
581 * cancelled entries are handled by someone else
582 */
583 if ((me->me_flags & ME_CANCEL) == 0) {
584 roll_stats[me->me_dt]++;
585 CRB_RELE(me);
586 kmem_cache_free(mapentry_cache, me);
587 }
588 } else
589 mep = &me->me_hash;
590 }
591 }
592 mutex_exit(&mtm->mtm_mutex);
593
594 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
595 map_check_linkage(mtm));
596
597 if (dolock)
598 rw_exit(&mtm->mtm_rwlock);
599 }
600
601 /*
602 * Find the disk offset of the next delta to roll.
603 * Returns 0: no more deltas to roll or a transaction is being committed
604 * 1: a delta to roll has been found and *mofp points
605 * to the master file disk offset
606 */
607 int
608 logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
609 {
610 mapentry_t *me;
611
612 ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
613 map_check_linkage(logmap));
614
615 mutex_enter(&logmap->mtm_mutex);
616 for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
617 me = me->me_next) {
618 /* already rolled */
619 if (me->me_flags & ME_ROLL) {
620 continue;
621 }
622
623 /* part of currently busy transaction; stop */
624 if (me->me_tid == logmap->mtm_tid) {
625 break;
626 }
627
628 /* part of commit-in-progress transaction; stop */
629 if (me->me_tid == logmap->mtm_committid) {
630 break;
631 }
632
633 /*
634 * We shouldn't see a DT_CANCEL mapentry whose
635 * tid != mtm_committid, or != mtm_tid since
636 * these are removed at the end of each committed
637 * transaction.
638 */
639 ASSERT(!(me->me_dt == DT_CANCEL));
640
641 *mofp = me->me_mof;
642 mutex_exit(&logmap->mtm_mutex);
643 return (1);
644 }
645 mutex_exit(&logmap->mtm_mutex);
646 return (0);
647 }
648
649 /*
650 * put mapentry on sorted age list
651 */
652 static void
653 logmap_list_age(mapentry_t **age, mapentry_t *meadd)
654 {
655 mapentry_t *me;
656
657 ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));
658
659 for (me = *age; me; age = &me->me_agenext, me = *age) {
660 if (me->me_age > meadd->me_age)
661 break;
662 }
663 meadd->me_agenext = me;
664 meadd->me_flags |= ME_AGE;
665 *age = meadd;
666 }
667
668 /*
669 * get a list of deltas within <mof, mof+nb>
670 * returns with mtm_rwlock held
671 * return value says whether the entire mof range is covered by deltas
672 */
673 int
674 logmap_list_get(
675 mt_map_t *mtm,
676 offset_t mof,
677 off_t nb,
678 mapentry_t **age)
679 {
680 off_t hnb;
681 mapentry_t *me;
682 mapentry_t **mep;
683 int rwtype = RW_READER;
684 offset_t savmof = mof;
685 off_t savnb = nb;
686 int entire = 0;
687 crb_t *crb;
688
689 mtm->mtm_ref = 1;
690 again:
691
692 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
693 map_check_linkage(mtm));
694
695 rw_enter(&mtm->mtm_rwlock, rwtype);
696 *age = NULL;
697 mutex_enter(&mtm->mtm_mutex);
698 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
699 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
700 if (hnb > nb)
701 hnb = nb;
702 /*
703 * find overlapping entries
704 */
705 mep = MAP_HASH(mof, mtm);
706 for (me = *mep; me; me = me->me_hash) {
707 if (me->me_dt == DT_CANCEL)
708 continue;
709 if (!DATAoverlapME(mof, hnb, me))
710 continue;
711 /*
712 * check if map entry is in use
713 * (about to be rolled).
714 */
715 if (me->me_flags & ME_AGE) {
716 /*
717 * reset the age bit in the list,
718 * upgrade the lock, and try again
719 */
720 for (me = *age; me; me = *age) {
721 *age = me->me_agenext;
722 me->me_flags &= ~ME_AGE;
723 }
724 mutex_exit(&mtm->mtm_mutex);
725 rw_exit(&mtm->mtm_rwlock);
726 rwtype = RW_WRITER;
727 mof = savmof;
728 nb = savnb;
729 entire = 0;
730 goto again;
731 } else {
732 /* add mapentry to age ordered list */
733 logmap_list_age(age, me);
734 crb = me->me_crb;
735 if (crb) {
736 if (DATAwithinCRB(savmof, savnb, crb)) {
737 entire = 1;
738 }
739 } else {
740 if (DATAwithinME(savmof, savnb, me)) {
741 entire = 1;
742 }
743 }
744 }
745 }
746 }
747 mutex_exit(&mtm->mtm_mutex);
748
749 ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
750 return (entire);
751 }
752
753 /*
754 * Get a list of deltas for rolling - returns success or failure.
755 * Also return the cached roll buffer if all deltas point to it.
756 */
757 int
758 logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
759 {
760 mapentry_t *me, **mep, *age = NULL;
761 crb_t *crb = NULL;
762
763 ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
764 ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
765 map_check_linkage(logmap));
766 ASSERT((mof & MAPBLOCKOFF) == 0);
767
768 rbp->rb_crb = NULL;
769
770 /*
771 * find overlapping entries
772 */
773 mutex_enter(&logmap->mtm_mutex);
774 mep = MAP_HASH(mof, logmap);
775 for (me = *mep; me; me = me->me_hash) {
776 if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
777 continue;
778 if (me->me_tid == logmap->mtm_tid)
779 continue;
780 if (me->me_tid == logmap->mtm_committid)
781 continue;
782 if (me->me_dt == DT_CANCEL)
783 continue;
784
785 /*
786 * Check if map entry is in use (by lufs_read_strategy())
787 * and if so reset the age bit in the list,
788 * upgrade the lock, and try again
789 */
790 if (me->me_flags & ME_AGE) {
791 for (me = age; me; me = age) {
792 age = me->me_agenext;
793 me->me_flags &= ~ME_AGE;
794 }
795 mutex_exit(&logmap->mtm_mutex);
796 return (1); /* failure */
797 } else {
798 /* add mapentry to age ordered list */
799 logmap_list_age(&age, me);
800 }
801 }
802 if (!age) {
803 goto out;
804 }
805
806 /*
807 * Mark the deltas as being rolled.
808 */
809 for (me = age; me; me = me->me_agenext) {
810 me->me_flags |= ME_ROLL;
811 }
812
813 /*
814 * Test if all deltas are covered by one valid roll buffer
815 */
816 crb = age->me_crb;
817 if (crb && !(crb->c_invalid)) {
818 for (me = age; me; me = me->me_agenext) {
819 if (me->me_crb != crb) {
820 crb = NULL;
821 break;
822 }
823 }
824 rbp->rb_crb = crb;
825 }
826 out:
827 rbp->rb_age = age;
828
829 mutex_exit(&logmap->mtm_mutex);
830
831 ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
832 logmap_logscan_debug(logmap, age));
833 ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
834 return (0); /* success */
835 }
836
837 void
838 logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
839 {
840 mapentry_t *me;
841
842 ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
843 mutex_enter(&mtm->mtm_mutex);
844 for (me = age; me; me = age) {
845 age = me->me_agenext;
846 me->me_flags &= ~ME_AGE;
847 }
848 mutex_exit(&mtm->mtm_mutex);
849 }
850
851 void
852 logmap_list_put(mt_map_t *mtm, mapentry_t *age)
853 {
854 mapentry_t *me;
855
856 ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
857 mutex_enter(&mtm->mtm_mutex);
858 for (me = age; me; me = age) {
859 age = me->me_agenext;
860 me->me_flags &= ~ME_AGE;
861 }
862 mutex_exit(&mtm->mtm_mutex);
863 rw_exit(&mtm->mtm_rwlock);
864 }
865
866 #define UFS_RW_BALANCE 2
867 int ufs_rw_balance = UFS_RW_BALANCE;
868
869 /*
870 * Check if we need to read the master.
871 * The master does not need to be read if the log deltas to the
872 * block are for one contiguous set of full disk sectors.
873 * Cylinder group bit maps DT_CG (8K), directory entries (512B),
874 * and possibly others should not require master disk reads.
875 * Calculate the sector map for writing later.
876 */
877 int
878 logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
879 {
880 offset_t mof;
881 crb_t *crb;
882 mapentry_t *me;
883 int32_t nb;
884 int i;
885 int start_sec, end_sec;
886 int read_needed = 0;
887 int all_inodes = 1;
888 int first_sec = INT_MAX;
889 int last_sec = -1;
890 rbsecmap_t secmap = 0;
891
892 /* LINTED: warning: logical expression always true: op "||" */
893 ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));
894
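	/*
	 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCKSIZE block
	 * being rolled; a set bit means that sector contains logged metadata
	 * and must be written back to the master device.
	 */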
895 for (me = age; me; me = me->me_agenext) {
896 crb = me->me_crb;
897 if (crb) {
898 nb = crb->c_nb;
899 mof = crb->c_mof;
900 } else {
901 nb = me->me_nb;
902 mof = me->me_mof;
903 }
904
905 /*
906 * If the delta is not sector aligned then
907 * read the whole block.
908 */
909 if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
910 read_needed = 1;
911 }
912
913 /* Set sector map used in the MAPBLOCKSIZE block. */
914 start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
915 end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
916 for (i = start_sec; i <= end_sec; i++) {
917 secmap |= UINT16_C(1) << i;
918 }
919
920 if (me->me_dt != DT_INODE) {
921 all_inodes = 0;
922 }
923 if (start_sec < first_sec) {
924 first_sec = start_sec;
925 }
926 if (end_sec > last_sec) {
927 last_sec = end_sec;
928 }
929 }
930
931 ASSERT(secmap);
932 ASSERT(first_sec != INT_MAX);
933 ASSERT(last_sec != -1);
934
935 if (all_inodes) {
936 /*
937 * Here we have a tradeoff choice. It must be better to
938 * do 2 writes in the same MAPBLOCKSIZE chunk than a
939 * read and a write. But what about 3 or more writes, versus
940 * a read+write? Where is the cut over? It will depend on
941 * the track caching, scsi driver and other activity.
942 * An unpublished tunable is defined (ufs_rw_balance) that
943 * currently defaults to 2.
944 */
945 if (!read_needed) {
946 int count = 0, gap = 0;
947 int sector_set; /* write needed to this sector */
948
949 /* Count the gaps (every 1 to 0 transition) */
950 for (i = first_sec + 1; i < last_sec; i++) {
951 sector_set = secmap & (UINT16_C(1) << i);
952 if (!gap && !sector_set) {
953 gap = 1;
954 count++;
955 if (count > ufs_rw_balance) {
956 read_needed = 1;
957 break;
958 }
959 } else if (gap && sector_set) {
960 gap = 0;
961 }
962 }
963 }
964
965 /*
966 * Inodes commonly make up the majority (~85%) of deltas.
967 * They cannot contain embedded user data, so it's safe to
968 * read and write them all in one IO.
969 * But for directory entries, shadow inode data, and
970 * quota record data, the user data fragments can be embedded
971 * between that metadata, and so it's not safe to read, modify,
972 * then write the entire range, as asynchronous user data
973 * writes could get overwritten with old data.
974 * Thus we have to create a segment map of the metadata that
975 * needs to get written.
976 *
977 * If user data was logged then this issue would go away.
978 */
979 if (read_needed) {
980 for (i = first_sec + 1; i < last_sec; i++) {
981 secmap |= (UINT16_C(1) << i);
982 }
983 }
984 }
985 rbp->rb_secmap = secmap;
986 return (read_needed);
987 }
988
989 /*
990 * Abort the load of a set of log map deltas.
991 * i.e.,
992 * Clear out all mapentries on this unit's log map
993 * which have a tid (transaction id) equal to the
994 * parameter tid. Walk the cancel list, taking everything
995 * off it, too.
996 */
997 static void
998 logmap_abort(ml_unit_t *ul, uint32_t tid)
999 {
1000 struct mt_map *mtm = ul->un_logmap; /* Log map */
1001 mapentry_t *me, **mep;
1002 int i;
1003
1004 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1005 map_check_linkage(mtm));
1006
1007 /*
1008 * wait for any outstanding reads to finish; lock out future reads
1009 */
1010 rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1011
1012 mutex_enter(&mtm->mtm_mutex);
1013 /* Take everything off cancel list */
1014 while ((me = mtm->mtm_cancel) != NULL) {
1015 mtm->mtm_cancel = me->me_cancel;
1016 me->me_flags &= ~ME_CANCEL;
1017 me->me_cancel = NULL;
1018 }
1019
1020 /*
1021 * Now take out all mapentries with current tid, and committid
1022 * as this function is called from logmap_logscan and logmap_commit.
1023 * When it is called from logmap_logscan, mtm_tid == mtm_committid.
1024 * But when logmap_abort is called from logmap_commit it is
1025 * because the log errored when trying to write the commit record,
1026 * after the async ops have been allowed to start in top_end_sync.
1027 * So we also need to remove all mapentries from the transaction whose
1028 * commit failed.
1029 */
1030 for (i = 0; i < mtm->mtm_nhash; i++) {
1031 mep = &mtm->mtm_hash[i];
1032 while ((me = *mep) != NULL) {
1033 if (me->me_tid == tid ||
1034 me->me_tid == mtm->mtm_committid) {
1035 *mep = me->me_hash;
1036 me->me_next->me_prev = me->me_prev;
1037 me->me_prev->me_next = me->me_next;
1038 if (!(me->me_flags & ME_USER)) {
1039 mtm->mtm_nme--;
1040 }
1041 CRB_RELE(me);
1042 kmem_cache_free(mapentry_cache, me);
1043 continue;
1044 }
1045 mep = &me->me_hash;
1046 }
1047 }
1048
1049 if (!(ul->un_flags & LDL_SCAN))
1050 mtm->mtm_flags |= MTM_CANCELED;
1051 mutex_exit(&mtm->mtm_mutex);
1052 mtm->mtm_dirty = 0;
1053 mtm->mtm_nmet = 0;
1054 rw_exit(&mtm->mtm_rwlock);
1055
1056 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1057 map_check_linkage(mtm));
1058 }
1059
1060 static void
1061 logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
1062 {
1063 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1064
1065 while (!ldl_has_space(ul, me)) {
1066 ASSERT(!(ul->un_flags & LDL_NOROLL));
1067 mutex_exit(&ul->un_log_mutex);
1068 logmap_forceroll(mtm);
1069 mutex_enter(&ul->un_log_mutex);
1070 if (ul->un_flags & LDL_ERROR)
1071 break;
1072 }
1073
1074 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1075 }
1076
1077 /*
1078 * put a list of deltas into a logmap
1079 * If va == NULL, don't write to the log.
1080 */
1081 void
1082 logmap_add(
1083 ml_unit_t *ul,
1084 char *va, /* Ptr to buf w/deltas & data */
1085 offset_t vamof, /* Offset on master of buf start */
1086 mapentry_t *melist) /* Entries to add */
1087 {
1088 offset_t mof;
1089 off_t nb;
1090 mapentry_t *me;
1091 mapentry_t **mep;
1092 mapentry_t **savmep;
1093 uint32_t tid;
1094 mt_map_t *mtm = ul->un_logmap;
1095
1096 mutex_enter(&ul->un_log_mutex);
1097 if (va)
1098 logmap_wait_space(mtm, ul, melist);
1099
1100 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1101 map_check_linkage(mtm));
1102
1103 mtm->mtm_ref = 1;
1104 mtm->mtm_dirty++;
1105 tid = mtm->mtm_tid;
1106 while (melist) {
1107 mof = melist->me_mof;
1108 nb = melist->me_nb;
1109
1110 /*
1111 * search for overlapping entries
1112 */
1113 savmep = mep = MAP_HASH(mof, mtm);
1114 mutex_enter(&mtm->mtm_mutex);
1115 while ((me = *mep) != 0) {
1116 /*
1117 * Data consumes old map entry; cancel map entry.
1118 * Take care when we replace an old map entry
1119 * which carries quota information with a newer entry
1120 * which does not. In that case the push function
1121 * would not be called to clean up the dquot structure.
1122 * This would be found later by invalidatedq() causing
1123 * a panic when the filesystem is unmounted.
1124 * We clean up the dquot manually and then replace
1125 * the map entry.
1126 */
1127 if (MEwithinDATA(me, mof, nb) &&
1128 ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1129 if (tid == me->me_tid &&
1130 ((me->me_flags & ME_AGE) == 0)) {
1131 *mep = me->me_hash;
1132 me->me_next->me_prev = me->me_prev;
1133 me->me_prev->me_next = me->me_next;
1134 ASSERT(!(me->me_flags & ME_USER));
1135 mtm->mtm_nme--;
1136 /*
1137 * Special case if the mapentry
1138 * carries a dquot and a push function.
1139 * We have to clean up the quota info
1140 * before replacing the mapentry.
1141 */
1142 if (me->me_dt == DT_QR)
1143 HANDLE_DQUOT(me, melist);
1144
1145 kmem_cache_free(mapentry_cache, me);
1146 continue;
1147 }
1148 me->me_cancel = mtm->mtm_cancel;
1149 mtm->mtm_cancel = me;
1150 me->me_flags |= ME_CANCEL;
1151 }
1152 mep = &(*mep)->me_hash;
1153 }
1154 mutex_exit(&mtm->mtm_mutex);
1155
1156 /*
1157 * remove from list
1158 */
1159 me = melist;
1160 melist = melist->me_hash;
1161 me->me_flags &= ~ME_LIST;
1162 /*
1163 * If va != NULL, put in the log.
1164 */
1165 if (va)
1166 ldl_write(ul, va, vamof, me);
1167 if (ul->un_flags & LDL_ERROR) {
1168 kmem_cache_free(mapentry_cache, me);
1169 continue;
1170 }
1171 ASSERT((va == NULL) ||
1172 ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1173 map_check_ldl_write(ul, va, vamof, me));
1174
1175 /*
1176 * put on hash
1177 */
1178 mutex_enter(&mtm->mtm_mutex);
1179 me->me_hash = *savmep;
1180 *savmep = me;
1181 me->me_next = (mapentry_t *)mtm;
1182 me->me_prev = mtm->mtm_prev;
1183 mtm->mtm_prev->me_next = me;
1184 mtm->mtm_prev = me;
1185 me->me_flags |= ME_HASH;
1186 me->me_tid = tid;
1187 me->me_age = mtm->mtm_age++;
1188 mtm->mtm_nme++;
1189 mtm->mtm_nmet++;
1190 mutex_exit(&mtm->mtm_mutex);
1191 }
1192
1193 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1194 map_check_linkage(mtm));
1195 mutex_exit(&ul->un_log_mutex);
1196 }
1197
1198 /*
1199 * Add the delta(s) into the log.
1200 * Create one cached roll buffer logmap entry, and reference count the
1201 * number of mapentries referring to it.
1202 * Cancel previous logmap entries.
1203 * logmap_add_buf is tolerant of failure to allocate a cached roll buffer.
1204 */
1205 void
1206 logmap_add_buf(
1207 ml_unit_t *ul,
1208 char *va, /* Ptr to buf w/deltas & data */
1209 offset_t bufmof, /* Offset on master of buf start */
1210 mapentry_t *melist, /* Entries to add */
1211 caddr_t buf, /* Buffer containing delta(s) */
1212 uint32_t bufsz) /* Size of buf */
1213 {
1214 offset_t mof;
1215 offset_t vamof = bufmof + (va - buf);
1216 off_t nb;
1217 mapentry_t *me;
1218 mapentry_t **mep;
1219 mapentry_t **savmep;
1220 uint32_t tid;
1221 mt_map_t *mtm = ul->un_logmap;
1222 crb_t *crb;
1223 crb_t *crbsav = NULL;
1224
1225 ASSERT((bufsz & DEV_BMASK) == 0);
1226 mutex_enter(&ul->un_log_mutex);
1227 logmap_wait_space(mtm, ul, melist);
1228
1229 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1230 map_check_linkage(mtm));
1231
1232 mtm->mtm_ref = 1;
1233 mtm->mtm_dirty++;
1234 tid = mtm->mtm_tid;
1235 while (melist) {
1236 mof = melist->me_mof;
1237 nb = melist->me_nb;
1238
1239 /*
1240 * search for overlapping entries
1241 */
1242 savmep = mep = MAP_HASH(mof, mtm);
1243 mutex_enter(&mtm->mtm_mutex);
1244 while ((me = *mep) != 0) {
1245 /*
1246 * Data consumes old map entry; cancel map entry.
1247 * Take care when we replace an old map entry
1248 * which carries quota information with a newer entry
1249 * which does not. In that case the push function
1250 * would not be called to clean up the dquot structure.
1251 * This would be found later by invalidatedq() causing
1252 * a panic when the filesystem is unmounted.
1253 * We clean up the dquot manually and then replace
1254 * the map entry.
1255 */
1256 crb = me->me_crb;
1257 if (MEwithinDATA(me, mof, nb) &&
1258 ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1259 if (tid == me->me_tid &&
1260 ((me->me_flags & ME_AGE) == 0)) {
1261 *mep = me->me_hash;
1262 me->me_next->me_prev = me->me_prev;
1263 me->me_prev->me_next = me->me_next;
1264 ASSERT(!(me->me_flags & ME_USER));
1265 mtm->mtm_nme--;
1266 /*
1267 * Special case if the mapentry
1268 * carries a dquot and a push function.
1269 * We have to clean up the quota info
1270 * before replacing the mapentry.
1271 */
1272 if (me->me_dt == DT_QR)
1273 HANDLE_DQUOT(me, melist);
1274
1275 /*
1276 * If this soon to be deleted mapentry
1277 * has a suitable roll buffer then
1278 * re-use it.
1279 */
1280 if (crb && (--crb->c_refcnt == 0)) {
1281 if (crbsav ||
1282 (crb->c_nb != bufsz)) {
1283 CRB_FREE(crb, me);
1284 } else {
1285 bcopy(buf, crb->c_buf,
1286 bufsz);
1287 crb->c_invalid = 0;
1288 crb->c_mof = bufmof;
1289 crbsav = crb;
1290 me->me_crb = NULL;
1291 }
1292 }
1293 kmem_cache_free(mapentry_cache, me);
1294 continue;
1295 }
1296 me->me_cancel = mtm->mtm_cancel;
1297 mtm->mtm_cancel = me;
1298 me->me_flags |= ME_CANCEL;
1299 }
1300
1301 /*
1302 * Inode deltas within the same fs block come
1303 * in individually as separate calls to logmap_add().
1304 * All others come in as one call. So check for an
1305 * existing entry where we can re-use the crb.
1306 */
1307 if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
1308 !crbsav && crb &&
1309 WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
1310 ASSERT(crb->c_mof == bufmof);
1311 ASSERT(crb->c_nb == bufsz);
1312 bcopy(buf, crb->c_buf, bufsz);
1313 crbsav = crb;
1314 }
1315 mep = &(*mep)->me_hash;
1316 }
1317 mutex_exit(&mtm->mtm_mutex);
1318
1319 /*
1320 * If we don't already have a crb then allocate one
1321 * and copy the incoming buffer. Only do this once
1322 * for all the incoming deltas.
1323 */
1324 if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
1325 /*
1326 * Only use a cached roll buffer if we
1327 * have enough memory, and check for failures.
1328 */
1329 if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
1330 (kmem_avail() > bufsz)) {
1331 crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
1332 } else {
1333 ufs_crb_alloc_fails++;
1334 }
1335 if (crbsav) {
1336 crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
1337 if (crbsav->c_buf) {
1338 atomic_add_64(&ufs_crb_size,
1339 (uint64_t)bufsz);
1340 if (ufs_crb_size > ufs_crb_max_size) {
1341 ufs_crb_max_size = ufs_crb_size;
1342 }
1343 bcopy(buf, crbsav->c_buf, bufsz);
1344 crbsav->c_nb = bufsz;
1345 crbsav->c_refcnt = 0;
1346 crbsav->c_invalid = 0;
1347 ASSERT((bufmof & DEV_BMASK) == 0);
1348 crbsav->c_mof = bufmof;
1349 } else {
1350 kmem_free(crbsav, sizeof (crb_t));
1351 crbsav = NULL;
1352 }
1353 }
1354 }
1355
1356 /*
1357 * remove from list
1358 */
1359 me = melist;
1360 melist = melist->me_hash;
1361 me->me_flags &= ~ME_LIST;
1362 me->me_crb = crbsav;
1363 if (crbsav) {
1364 crbsav->c_refcnt++;
1365 }
1366 crbsav = NULL;
1367
1368 ASSERT(va);
1369 ldl_write(ul, va, vamof, me); /* add to on-disk log */
1370 if (ul->un_flags & LDL_ERROR) {
1371 CRB_RELE(me);
1372 kmem_cache_free(mapentry_cache, me);
1373 continue;
1374 }
1375 ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1376 map_check_ldl_write(ul, va, vamof, me));
1377
1378 /*
1379 * put on hash
1380 */
1381 mutex_enter(&mtm->mtm_mutex);
1382 me->me_hash = *savmep;
1383 *savmep = me;
1384 me->me_next = (mapentry_t *)mtm;
1385 me->me_prev = mtm->mtm_prev;
1386 mtm->mtm_prev->me_next = me;
1387 mtm->mtm_prev = me;
1388 me->me_flags |= ME_HASH;
1389 me->me_tid = tid;
1390 me->me_age = mtm->mtm_age++;
1391 mtm->mtm_nme++;
1392 mtm->mtm_nmet++;
1393 mutex_exit(&mtm->mtm_mutex);
1394 }
1395
1396 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1397 map_check_linkage(mtm));
1398 mutex_exit(&ul->un_log_mutex);
1399 }
1400
1401 /*
1402 * free up any cancelled deltas
1403 */
1404 void
1405 logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
1406 {
1407 int dolock = 0;
1408 mapentry_t *me;
1409 mapentry_t **mep;
1410
1411 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1412 map_check_linkage(mtm));
1413
1414 again:
1415 if (dolock)
1416 rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1417
1418 /*
1419 * At EOT, cancel the indicated deltas
1420 */
1421 mutex_enter(&mtm->mtm_mutex);
1422 if (mtm->mtm_flags & MTM_CANCELED) {
1423 mtm->mtm_flags &= ~MTM_CANCELED;
1424 ASSERT(dolock == 0);
1425 mutex_exit(&mtm->mtm_mutex);
1426 return;
1427 }
1428
1429 while ((me = *cancelhead) != NULL) {
1430 /*
1431 * roll forward or read collision; wait and try again
1432 */
1433 if (me->me_flags & ME_AGE) {
1434 ASSERT(dolock == 0);
1435 mutex_exit(&mtm->mtm_mutex);
1436 dolock = 1;
1437 goto again;
1438 }
1439 /*
1440 * remove from cancel list
1441 */
1442 *cancelhead = me->me_cancel;
1443 me->me_cancel = NULL;
1444 me->me_flags &= ~(ME_CANCEL);
1445
1446 /*
1447 * logmap_remove_roll handles ME_ROLL entries later
1448 * we leave them around for logmap_iscancel
1449 * XXX is this necessary?
1450 */
1451 if (me->me_flags & ME_ROLL)
1452 continue;
1453
1454 /*
1455 * remove from hash (if necessary)
1456 */
1457 if (me->me_flags & ME_HASH) {
1458 mep = MAP_HASH(me->me_mof, mtm);
1459 while (*mep) {
1460 if (*mep == me) {
1461 *mep = me->me_hash;
1462 me->me_next->me_prev = me->me_prev;
1463 me->me_prev->me_next = me->me_next;
1464 me->me_flags &= ~(ME_HASH);
1465 if (!(me->me_flags & ME_USER)) {
1466 mtm->mtm_nme--;
1467 }
1468 break;
1469 } else
1470 mep = &(*mep)->me_hash;
1471 }
1472 }
1473 /*
1474 * put the entry on the free list
1475 */
1476 CRB_RELE(me);
1477 kmem_cache_free(mapentry_cache, me);
1478 }
1479 mutex_exit(&mtm->mtm_mutex);
1480 if (dolock)
1481 rw_exit(&mtm->mtm_rwlock);
1482
1483 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1484 map_check_linkage(mtm));
1485 }
1486
1487
1488 void
1489 logmap_commit(ml_unit_t *ul, uint32_t tid)
1490 {
1491 mapentry_t me;
1492 mt_map_t *mtm = ul->un_logmap;
1493
1494
1495 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1496
1497 /*
1498 * async'ly write a commit rec into the log
1499 */
1500 if (mtm->mtm_dirty) {
1501 /*
1502 * put commit record into log
1503 */
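		/* a commit record carries the transaction id in its me_mof field */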
1504 me.me_mof = mtm->mtm_tid;
1505 me.me_dt = DT_COMMIT;
1506 me.me_nb = 0;
1507 me.me_hash = NULL;
1508 logmap_wait_space(mtm, ul, &me);
1509 ldl_write(ul, NULL, (offset_t)0, &me);
1510 ldl_round_commit(ul);
1511
1512 /*
1513 * abort on error; else reset dirty flag
1514 */
1515 if (ul->un_flags & LDL_ERROR)
1516 logmap_abort(ul, tid);
1517 else {
1518 mtm->mtm_dirty = 0;
1519 mtm->mtm_nmet = 0;
1520 mtm->mtm_cfrags = 0;
1521 }
1522 /* push commit */
1523 ldl_push_commit(ul);
1524 }
1525 }
1526
1527 void
1528 logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
1529 {
1530 off_t lof;
1531 uint32_t tid;
1532 mapentry_t *me;
1533
1534 /*
1535 * move the head forward so the log knows how full it is
1536 * Make sure to skip any mapentry whose me_lof is 0, these
1537 * Make sure to skip any mapentry whose me_lof is 0; these
1538 * are just placeholders for DT_CANCELED freed user blocks
1539 */
1540 mutex_enter(&ul->un_log_mutex);
1541 mutex_enter(&mtm->mtm_mutex);
1542 me = mtm->mtm_next;
1543 while (me != (mapentry_t *)mtm && me->me_lof == 0) {
1544 me = me->me_next;
1545 }
1546
1547 if (me == (mapentry_t *)mtm)
1548 lof = -1;
1549 else {
1550 lof = me->me_lof;
1551 tid = me->me_tid;
1552 }
1553 mutex_exit(&mtm->mtm_mutex);
1554 ldl_sethead(ul, lof, tid);
1555 if (lof == -1)
1556 mtm->mtm_age = 0;
1557 mutex_exit(&ul->un_log_mutex);
1558 }
1559
1560 void
1561 logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
1562 {
1563 off_t lof;
1564 size_t nb;
1565
1566 /*
1567 * set the tail after the logmap_abort
1568 */
1569 mutex_enter(&ul->un_log_mutex);
1570 mutex_enter(&mtm->mtm_mutex);
1571 if (mtm->mtm_prev == (mapentry_t *)mtm)
1572 lof = -1;
1573 else {
1574 /*
1575 * set the tail to the end of the last commit
1576 */
1577 lof = mtm->mtm_tail_lof;
1578 nb = mtm->mtm_tail_nb;
1579 }
1580 mutex_exit(&mtm->mtm_mutex);
1581 ldl_settail(ul, lof, nb);
1582 mutex_exit(&ul->un_log_mutex);
1583 }
1584
1585 /*
1586 * When resetting a device, roll the log until every
1587 * delta has been rolled forward
1588 */
1589 void
1590 logmap_roll_dev(ml_unit_t *ul)
1591 {
1592 mt_map_t *mtm = ul->un_logmap;
1593 mapentry_t *me;
1594 ufsvfs_t *ufsvfsp = ul->un_ufsvfs;
1595
1596 again:
1597 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1598 map_check_linkage(mtm));
1599 if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
1600 return;
1601
1602 /*
1603 * look for deltas
1604 */
1605 mutex_enter(&mtm->mtm_mutex);
1606 for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
1607 if (me->me_flags & ME_ROLL)
1608 break;
1609 if (me->me_tid == mtm->mtm_tid)
1610 continue;
1611 if (me->me_tid == mtm->mtm_committid)
1612 continue;
1613 break;
1614 }
1615
1616 /*
1617 * found a delta; kick the roll thread
1618 * but only if the thread is running... (jmh)
1619 */
1620 if (me != (mapentry_t *)mtm) {
1621 mutex_exit(&mtm->mtm_mutex);
1622 logmap_forceroll(mtm);
1623 goto again;
1624 }
1625
1626 /*
1627 * no more deltas, return
1628 */
1629 mutex_exit(&mtm->mtm_mutex);
1630 (void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);
1631
1632 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1633 map_check_linkage(mtm));
1634 }
1635
1636 static void
1637 logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
1638 {
1639 mapentry_t *me;
1640 mapentry_t **mep;
1641 mt_map_t *mtm = ul->un_logmap;
1642 int frags;
1643
1644 /*
1645 * map has been referenced and is dirty
1646 */
1647 mtm->mtm_ref = 1;
1648 mtm->mtm_dirty++;
1649
1650 /*
1651 * get a mapentry
1652 */
1653 me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1654 bzero(me, sizeof (mapentry_t));
1655
1656 /*
1657 * initialize cancel record and put in logmap
1658 */
1659 me->me_mof = mof;
1660 me->me_nb = nb;
1661 me->me_dt = DT_CANCEL;
1662 me->me_tid = mtm->mtm_tid;
1663 me->me_hash = NULL;
1664
1665 /*
1666 * Write delta to log if this delta is for metadata. If this is not
1667 * metadata, it is user data and we are just putting a cancel
1668 * mapentry into the hash to cancel a user block deletion,
1669 * in which case we do not want the block to be allocated
1670 * within this moby. This cancel entry will prevent the block from
1671 * being allocated within the moby and prevent user data corruption
1672 * if we happen to crash before this moby is committed.
1673 */
1674 mutex_enter(&ul->un_log_mutex);
1675 if (metadata) {
1676 logmap_wait_space(mtm, ul, me);
1677 ldl_write(ul, NULL, (offset_t)0, me);
1678 if (ul->un_flags & LDL_ERROR) {
1679 kmem_cache_free(mapentry_cache, me);
1680 mutex_exit(&ul->un_log_mutex);
1681 return;
1682 }
1683 }
1684
1685 /*
1686 * put in hash and on cancel list
1687 */
1688 mep = MAP_HASH(mof, mtm);
1689 mutex_enter(&mtm->mtm_mutex);
1690 me->me_age = mtm->mtm_age++;
1691 me->me_hash = *mep;
1692 *mep = me;
1693 me->me_next = (mapentry_t *)mtm;
1694 me->me_prev = mtm->mtm_prev;
1695 mtm->mtm_prev->me_next = me;
1696 mtm->mtm_prev = me;
1697 me->me_cancel = mtm->mtm_cancel;
1698 mtm->mtm_cancel = me;
1699 if (metadata) {
1700 mtm->mtm_nme++;
1701 mtm->mtm_nmet++;
1702 } else {
1703 me->me_flags = ME_USER;
1704 }
1705 me->me_flags |= (ME_HASH|ME_CANCEL);
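	/*
	 * For user data cancels, account for the fragments in any partial
	 * block being freed; once mtm_cfrags reaches mtm_cfragmax the next
	 * commit is forced (see logmap_need_commit()).
	 */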
1706 if (!(metadata)) {
1707 frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
1708 if (frags)
1709 mtm->mtm_cfrags +=
1710 numfrags(ul->un_ufsvfs->vfs_fs, frags);
1711 }
1712 mutex_exit(&mtm->mtm_mutex);
1713
1714 mutex_exit(&ul->un_log_mutex);
1715 }
1716
1717 /*
1718 * cancel entries in a logmap (entries are freed at EOT)
1719 */
1720 void
1721 logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
1722 {
1723 int32_t hnb;
1724 mapentry_t *me;
1725 mapentry_t **mep;
1726 mt_map_t *mtm = ul->un_logmap;
1727 crb_t *crb;
1728
1729 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1730 map_check_linkage(mtm));
1731
1732 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1733 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1734 if (hnb > nb)
1735 hnb = nb;
1736 /*
1737 * Find overlapping metadata entries. Don't search through
1738 * the hash chains if this is user data because it is only
1739 * possible to have overlapping map entries for metadata,
1740 * and the search can become expensive for large files.
1741 */
1742 if (metadata) {
1743 mep = MAP_HASH(mof, mtm);
1744 mutex_enter(&mtm->mtm_mutex);
1745 for (me = *mep; me; me = me->me_hash) {
1746 if (!DATAoverlapME(mof, hnb, me))
1747 continue;
1748
1749 ASSERT(MEwithinDATA(me, mof, hnb));
1750
1751 if ((me->me_flags & ME_CANCEL) == 0) {
1752 me->me_cancel = mtm->mtm_cancel;
1753 mtm->mtm_cancel = me;
1754 me->me_flags |= ME_CANCEL;
1755 crb = me->me_crb;
1756 if (crb) {
1757 crb->c_invalid = 1;
1758 }
1759 }
1760 }
1761 mutex_exit(&mtm->mtm_mutex);
1762 }
1763
1764 /*
1765 * put a cancel record into the log
1766 */
1767 logmap_cancel_delta(ul, mof, hnb, metadata);
1768 }
1769
1770 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1771 map_check_linkage(mtm));
1772 }
1773
1774 /*
1775 * check for overlap w/cancel delta
1776 */
1777 int
1778 logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
1779 {
1780 off_t hnb;
1781 mapentry_t *me;
1782 mapentry_t **mep;
1783
1784 mutex_enter(&mtm->mtm_mutex);
1785 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1786 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1787 if (hnb > nb)
1788 hnb = nb;
1789 /*
1790 * search for dup entry
1791 */
1792 mep = MAP_HASH(mof, mtm);
1793 for (me = *mep; me; me = me->me_hash) {
1794 if (((me->me_flags & ME_ROLL) == 0) &&
1795 (me->me_dt != DT_CANCEL))
1796 continue;
1797 if (DATAoverlapME(mof, hnb, me))
1798 break;
1799 }
1800
1801 /*
1802 * overlap detected
1803 */
1804 if (me) {
1805 mutex_exit(&mtm->mtm_mutex);
1806 return (1);
1807 }
1808 }
1809 mutex_exit(&mtm->mtm_mutex);
1810 return (0);
1811 }
1812
1813 static int
1814 logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
1815 {
1816 mapentry_t *me;
1817 int error;
1818 mt_map_t *mtm = ul->un_logmap;
1819
1820 /*
1821 * verify delta header; failure == mediafail
1822 */
1823 error = 0;
1824 /* delta type */
1825 if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
1826 error = EINVAL;
1827 if (dp->d_typ == DT_COMMIT) {
1828 if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
1829 error = EINVAL;
1830 } else {
1831 /* length of delta */
1832 if ((dp->d_nb < INT32_C(0)) ||
1833 (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
1834 error = EINVAL;
1835
1836 /* offset on master device */
1837 if (dp->d_mof < INT64_C(0))
1838 error = EINVAL;
1839 }
1840
1841 if (error) {
1842 ldl_seterror(ul, "Error processing ufs log data during scan");
1843 return (error);
1844 }
1845
1846 /*
1847 * process commit record
1848 */
1849 if (dp->d_typ == DT_COMMIT) {
1850 if (mtm->mtm_dirty) {
1851 ASSERT(dp->d_nb == INT32_C(0));
1852 logmap_free_cancel(mtm, &mtm->mtm_cancel);
1853 mtm->mtm_dirty = 0;
1854 mtm->mtm_nmet = 0;
1855 mtm->mtm_tid++;
1856 mtm->mtm_committid = mtm->mtm_tid;
1857 ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1858 logmap_logscan_commit_debug(lof, mtm));
1859 }
1860 /*
1861 * return #bytes to next sector (next delta header)
1862 */
1863 *nbp = ldl_logscan_nbcommit(lof);
1864 mtm->mtm_tail_lof = lof;
1865 mtm->mtm_tail_nb = *nbp;
1866 return (0);
1867 }
1868
1869 /*
1870 * add delta to logmap
1871 */
1872 me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1873 bzero(me, sizeof (mapentry_t));
1874 me->me_lof = lof;
1875 me->me_mof = dp->d_mof;
1876 me->me_nb = dp->d_nb;
1877 me->me_tid = mtm->mtm_tid;
1878 me->me_dt = dp->d_typ;
1879 me->me_hash = NULL;
1880 me->me_flags = (ME_LIST | ME_SCAN);
1881 logmap_add(ul, NULL, 0, me);
1882 switch (dp->d_typ) {
1883 case DT_CANCEL:
1884 me->me_flags |= ME_CANCEL;
1885 me->me_cancel = mtm->mtm_cancel;
1886 mtm->mtm_cancel = me;
1887 break;
1888 default:
1889 ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1890 logmap_logscan_add_debug(dp, mtm));
1891 break;
1892 }
1893
1895 /*
1896 * return #bytes till next delta header
1897 */
1898 if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
1899 *nbp = 0;
1900 else
1901 *nbp = dp->d_nb;
1902 return (0);
1903 }
1904
1905 void
1906 logmap_logscan(ml_unit_t *ul)
1907 {
1908 size_t nb, nbd;
1909 off_t lof;
1910 struct delta delta;
1911 mt_map_t *logmap = ul->un_logmap;
1912
1913 ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);
1914
1915 /*
1916 * prepare the log for a logscan
1917 */
1918 ldl_logscan_begin(ul);
1919
1920 /*
1921 * prepare the logmap for a logscan
1922 */
1923 (void) map_free_entries(logmap);
1924 logmap->mtm_tid = 0;
1925 logmap->mtm_committid = UINT32_C(0);
1926 logmap->mtm_age = 0;
1927 logmap->mtm_dirty = 0;
1928 logmap->mtm_ref = 0;
1929
1930 /*
1931 * while not at end of log
1932 * read delta header
1933 * add to logmap
1934 * seek to beginning of next delta
1935 */
1936 lof = ul->un_head_lof;
1937 nbd = sizeof (delta);
1938 while (lof != ul->un_tail_lof) {
1939
1940 /* read delta header */
1941 if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
1942 break;
1943
1944 /* add to logmap */
1945 if (logmap_logscan_add(ul, &delta, lof, &nb))
1946 break;
1947
1948 /* seek to next header (skip data) */
1949 if (ldl_logscan_read(ul, &lof, nb, NULL))
1950 break;
1951 }
1952
1953 /*
1954 * remove the last partial transaction from the logmap
1955 */
1956 logmap_abort(ul, logmap->mtm_tid);
1957
1958 ldl_logscan_end(ul);
1959 }
1960
1961 void
1962 _init_map(void)
1963 {
1964 /*
1965 * Initialise the mapentry cache. No constructor or destructor
1966 * is needed. Also no reclaim function is supplied as reclaiming
1967 * current entries is not possible.
1968 */
1969 mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
1970 sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1971 }
1972
1973 /*
1974 * Special case when we replace an old map entry which carries quota
1975 * information with a newer entry which does not.
1976 * In that case the push function would not be called to clean up the
1977 * dquot structure. This would be found later by invalidatedq() causing
1978 * a panic when the filesystem is unmounted.
1979 * We clean up the dquot manually before replacing the map entry.
1980 */
1981 void
1982 handle_dquot(mapentry_t *me)
1983 {
1984 int dolock = 0;
1985 int domutex = 0;
1986 struct dquot *dqp;
1987
1988 dqp = (struct dquot *)me->me_arg;
1989
1990 /*
1991 * We need vfs_dqrwlock to call dqput()
1992 */
1993 dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
1994 if (dolock)
1995 rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);
1996
1997 domutex = (!MUTEX_HELD(&dqp->dq_lock));
1998 if (domutex)
1999 mutex_enter(&dqp->dq_lock);
2000
2001 /*
2002 * Only clean up if the dquot is referenced
2003 */
2004 if (dqp->dq_cnt == 0) {
2005 if (domutex)
2006 mutex_exit(&dqp->dq_lock);
2007 if (dolock)
2008 rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2009 return;
2010 }
2011
2012 dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
2013 dqput(dqp);
2014
2015 if (domutex)
2016 mutex_exit(&dqp->dq_lock);
2017
2018 if (dolock)
2019 rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2020
2021 }
2022