/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* handy */ #define abs(x) ((x) < 0? 
-(x): (x)) #if defined(DEBUG) #define DBGLVL_NONE 0x00000000 #define DBGLVL_MAJOR 0x00000100 #define DBGLVL_MINOR 0x00000200 #define DBGLVL_MINUTE 0x00000400 #define DBGLVL_TRIVIA 0x00000800 #define DBGLVL_HIDEOUS 0x00001000 #define DBGFLG_NONE 0x00000000 #define DBGFLG_NOPANIC 0x00000001 #define DBGFLG_LVLONLY 0x00000002 #define DBGFLG_FIXWOULDPANIC 0x00000004 #define DBGFLG_FLAGMASK 0x0000000F #define DBGFLG_LEVELMASK ~DBGFLG_FLAGMASK #define DEBUG_FLAGS (ufs_fix_failure_dbg & DBGFLG_FLAGMASK) #define DEBUG_LEVEL (ufs_fix_failure_dbg & DBGFLG_LEVELMASK) unsigned int ufs_fix_failure_dbg = DBGLVL_NONE | DBGFLG_NONE; #define DCALL(dbg_level, call) \ { \ if (DEBUG_LEVEL != DBGLVL_NONE) { \ if (DEBUG_FLAGS & DBGFLG_LVLONLY) { \ if (DEBUG_LEVEL & dbg_level) { \ call; \ } \ } else { \ if (dbg_level <= DEBUG_LEVEL) { \ call; \ } \ } \ } \ } #define DPRINTF(dbg_level, msg) DCALL(dbg_level, printf msg) #define MAJOR(msg) DPRINTF(DBGLVL_MAJOR, msg) #define MINOR(msg) DPRINTF(DBGLVL_MINOR, msg) #define MINUTE(msg) DPRINTF(DBGLVL_MINUTE, msg) #define TRIVIA(msg) DPRINTF(DBGLVL_TRIVIA, msg) #define HIDEOUS(msg) DPRINTF(DBGLVL_HIDEOUS, msg) #else /* !DEBUG */ #define DCALL(ignored_dbg_level, ignored_routine) #define MAJOR(ignored) #define MINOR(ignored) #define MINUTE(ignored) #define TRIVIA(ignored) #define HIDEOUS(ignored) #endif /* DEBUG */ #define NULLSTR(str) (!(str) || *(str) == '\0'? "" : (str)) #define NULSTRING "" /* somewhat arbitrary limits, in seconds */ /* all probably ought to be different, but these are convenient for debugging */ const time_t UF_TOO_LONG = 128; /* max. wait for fsck start */ /* all of these are in units of seconds used for retry period while ... 
*/ const time_t UF_FIXSTART_PERIOD = 16; /* awaiting fsck start */ const time_t UF_FIXPOLL_PERIOD = 256; /* awaiting fsck finish */ const time_t UF_SHORT_ERROR_PERIOD = 4; /* after (lockfs) error */ const time_t UF_LONG_ERROR_PERIOD = 512; /* after (lockfs) error */ #define NO_ERROR 0 #define LOCKFS_OLOCK LOCKFS_MAXLOCK+1 const ulong_t GB = 1024 * 1024 * 1024; const ulong_t SecondsPerGig = 1024; /* ~17 minutes (overestimate) */ /* * per filesystem flags */ const int UFSFX_PANIC = (UFSMNT_ONERROR_PANIC >> 4); const int UFSFX_LCKONLY = (UFSMNT_ONERROR_LOCK >> 4); const int UFSFX_LCKUMOUNT = (UFSMNT_ONERROR_UMOUNT >> 4); const int UFSFX_DEFAULT = (UFSMNT_ONERROR_DEFAULT >> 4); const int UFSFX_REPAIR_START = 0x10000000; /* return protocols */ typedef enum triage_return_code { TRIAGE_DEAD = -1, TRIAGE_NO_SPIRIT, TRIAGE_ATTEND_TO } triage_t; typedef enum statefunc_return_code { SFRC_SUCCESS = 1, SFRC_FAIL = 0 } sfrc_t; /* external references */ /* in ufs_thread.c */ extern int ufs_thread_run(struct ufs_q *, callb_cpr_t *cprinfop); extern int ufs_checkaccton(vnode_t *); /* in ufs_lockfs.c */ extern int ufs_checkswapon(vnode_t *); /* in ufs_lockfs.c */ extern struct pollhead ufs_pollhd; /* in ufs_vnops.c */ /* globals */ struct ufs_q ufs_fix; /* * patchable constants: * These are set in ufsfx_init() [called at modload] */ struct ufs_failure_tunable { long uft_too_long; /* limit repair startup time */ long uft_fixstart_period; /* pre-repair start period */ long uft_fixpoll_period; /* post-fsck start period */ long uft_short_err_period; /* post-error short period */ long uft_long_err_period; /* post-error long period */ } ufsfx_tune; /* internal statistics of events */ struct uf_statistics { ulong_t ufst_lock_violations; ulong_t ufst_current_races; ulong_t ufst_unmount_failures; ulong_t ufst_num_fixed; ulong_t ufst_num_failed; ulong_t ufst_cpu_waste; time_t ufst_last_start_tm; kmutex_t ufst_mutex; } uf_stats; typedef enum state_action { UFA_ERROR = -1, /* internal error */ 
UFA_FOUND,		/* found uf in state */
	UFA_SET			/* change uf to state */
} ufsa_t;

/*
 * state definition
 *
 * One entry per failure state: its value and printable name, the function
 * invoked when a failure is found in or set to that state, the bitmask of
 * states it may legally be entered from, and a few attributes.
 */
typedef struct uf_state_desc {
	int	  ud_v;					/* value */
	char	 *ud_name;				/* name */
	sfrc_t	(*ud_sfp)(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
							/* per-state actions */
	ufs_failure_states_t	  ud_prev;		/* valid prev. states */

	struct uf_state_desc_attr {
		unsigned	terminal:1;	/* no action req. if found */
		unsigned	at_fail:1;	/* state set by thread */
						/* encountering the error */
		unsigned	unused;
	} ud_attr;
} ufsd_t;

/*
 * forward references
 */

/* thread to watch for failures */
static void	ufsfx_thread_fix_failures(void *);
static int 	ufsfx_do_failure_q(void);
static void	ufsfx_kill_fix_failure_thread(void *);

/* routines called when failure occurs */
static int		 ufs_fault_v(vnode_t *, char *, va_list)
	__KVPRINTFLIKE(2);
static ufs_failure_t	*init_failure(vnode_t *, char *, va_list)
	__KVPRINTFLIKE(2);
static void		 queue_failure(ufs_failure_t *);
/*PRINTFLIKE2*/
static void		 real_panic(ufs_failure_t *, const char *, ...)
	__KPRINTFLIKE(2);
static void		 real_panic_v(ufs_failure_t *, const char *, va_list)
	__KVPRINTFLIKE(2);
static triage_t		 triage(vnode_t *);

/* routines called when failure record is acted upon */
static sfrc_t	set_state(ufs_failure_t *, ufs_failure_states_t);
static int	state_trans_valid(ufs_failure_states_t, ufs_failure_states_t);
static int	terminal_state(ufs_failure_states_t);

/*
 * routines called when states entered/found
 * (the three-argument ones match the ud_sfp slot of ufsd_t)
 */
static sfrc_t	sf_minimum(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_undef(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_init(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_queue(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_found_queue(ufs_failure_t *);
static sfrc_t	sf_nonterm_cmn(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_term_cmn(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_panic(ufs_failure_t *, ufsa_t, ufs_failure_states_t);
static sfrc_t	sf_set_trylck(ufs_failure_t *);
static sfrc_t	sf_set_locked(ufs_failure_t *);
static sfrc_t	sf_found_trylck(ufs_failure_t *);
static sfrc_t	sf_found_lock_fix_cmn(ufs_failure_t *, ufs_failure_states_t);
static sfrc_t	sf_found_umount(ufs_failure_t *);

/* support routines, called by sf_nonterm_cmn and sf_term_cmn */
static time_t 	trylock_time_exceeded(ufs_failure_t *);
static void 	pester_msg(ufs_failure_t *, int);
static int 	get_lockfs_status(ufs_failure_t *, struct lockfs *);
static void 	alloc_lockfs_comment(ufs_failure_t *, struct lockfs *);
static int 	set_lockfs(ufs_failure_t *, struct lockfs *);
static int 	lockfs_failure(ufs_failure_t *);
static int 	lockfs_success(ufs_failure_t *);
static int	fsck_active(ufs_failure_t *);

/* low-level support routines */
static ufsd_t	*get_state_desc(ufs_failure_states_t);
static char	*fs_name(ufs_failure_t *);

/* name-lookup and dump helpers that exist only in DEBUG builds */
#if defined(DEBUG)
static char	*state_name(ufs_failure_states_t);
static char	*lock_name(struct lockfs *);
static char	*err_name(int);
static
char	*act_name(ufsa_t);
static void	 dump_uf_list(char *msg);
static void	 dump_uf(ufs_failure_t *, int i);
#endif /* DEBUG */
/*
 *
 * State Transitions:
 *
 * normally:
 * if flagged to be locked but not unmounted:	(UFSMNT_ONERROR_LOCK)
 *	UNDEF -> INIT -> QUEUE -> TRYLCK -> LOCKED -> FIXING -> FIXED
 *
 * The only difference between these two is that the fsck must be started
 * manually.
 * (NOTE(review): only one sequence is listed above; the second case this
 * sentence compares it with is not present in this comment.)
 *
 * if flagged to be unmounted:			(UFSMNT_ONERROR_UMOUNT)
 *	UNDEF -> INIT -> QUEUE -> TRYLCK -> LOCKED -> UMOUNT -> NOTFIX
 *
 * if flagged to panic:				(UFSMNT_ONERROR_PANIC)
 *	UNDEF -> INIT -> PANIC
 *
 * if a secondary panic on a file system which has an active failure
 * record:
 *	UNDEF -> INIT -> QUEUE -> REPLICA
 *
 * UNDEF, INIT, QUEUE all are set in the context of the failing thread.
 * All other states (except possibly PANIC) are set in by the monitor
 * (lock) thread.
 *
 */

/*
 * State table.  Columns follow ufsd_t:
 *	value, name, per-state function, valid previous states,
 *	{ terminal, at_fail, unused }
 *
 * Entry 0 (UF_ILLEGAL) is what get_state_desc() returns when no other
 * entry matches; the table ends at the entry whose name is NULL.
 */
ufsd_t	state_desc[] =
{
	{ UF_ILLEGAL,	"in an unknown state",	sf_minimum,	UF_ILLEGAL,
								{ 0, 1, 0 } },
	{ UF_UNDEF,	"undefined",		sf_undef,	UF_UNDEF,
								{ 0, 1, 0 } },
	{ UF_INIT,	"being initialized",	sf_init,	UF_UNDEF,
								{ 0, 1, 0 } },
	{ UF_QUEUE,	"queued",		sf_queue,	UF_INIT,
								{ 0, 1, 0 } },
	{ UF_TRYLCK,	"trying to be locked",	sf_nonterm_cmn,
						UF_QUEUE,	{ 0, 0, 0 } },
	{ UF_LOCKED,	"locked",		sf_nonterm_cmn,
					UF_TRYLCK | UF_FIXING,	{ 0, 0, 0 } },
	{ UF_UMOUNT,	"being unmounted",	sf_nonterm_cmn,

#if defined(DEBUG)
					UF_PANIC |
#endif /* DEBUG */
					UF_TRYLCK | UF_LOCKED, 	{ 0, 0, 0 } },
	{ UF_FIXING,	"being fixed",		sf_nonterm_cmn,
						UF_LOCKED,	{ 0, 0, 0 } },
	{ UF_FIXED,	"fixed",		sf_term_cmn,
						UF_FIXING,	{ 1, 0, 0 } },
	{ UF_NOTFIX,	"not fixed",		sf_term_cmn,

#if defined(DEBUG)
							UF_PANIC |
#endif /* DEBUG */

	    UF_QUEUE | UF_TRYLCK | UF_LOCKED | UF_UMOUNT | UF_FIXING,
								{ 1, 0, 0 } },
	{ UF_REPLICA,	"a replica",		sf_term_cmn,
						UF_QUEUE,	{ 1, 0, 0 } },
	{ UF_PANIC,	"panicking",		sf_panic,
		/* XXX make this narrower */	UF_ALLSTATES,	{ 0, 0, 0 } },
	{ UF_UNDEF,	NULL,			((sfrc_t (*)()) NULL),
						UF_UNDEF, 	{ 0, 0, 0 } }
};

/* unified collection */
struct ufsfx_info {
	struct
uf_statistics		*ufi_statp;
	struct ufs_failure_tunable	*ufi_tunep;
	ufsd_t				*ufi_statetab;
} uffsinfo;

#if defined(DEBUG)
/* DEBUG-only tables mapping values to printable names; NULL name ends each */
struct action_description {
	ufsa_t	 ad_v;
	char	*ad_name;
};

#define	EUNK		(-1)

struct error_description {
	int	 ed_errno;
	char	*ed_name;
} err_desc[] = {
	{ EUNK,		"" },
	{ EINVAL,	"EINVAL" },
	{ EACCES,	"EACCES" },
	{ EPERM,	"EPERM" },
	{ EIO,		"EIO" },
	{ EDEADLK,	"EDEADLK" },
	{ EBUSY,	"EBUSY" },
	{ EAGAIN,	"EAGAIN" },
	{ ERESTART,	"ERESTART" },
	{ ETIMEDOUT,	"ETIMEDOUT" },
	{ NO_ERROR,	"Ok" },
	{ EUNK,		NULL }
};

struct action_description act_desc[] = {
	{ UFA_ERROR,	"" },
	{ UFA_FOUND,	"\"found\"" },
	{ UFA_SET,	"\"set\"" },
	{ UFA_ERROR,	NULL },
};

#define	LOCKFS_BADLOCK	(-1)

struct lock_description {
	int	 ld_type;
	char	*ld_name;
} lock_desc[] = {
	{ LOCKFS_BADLOCK,	"" },
	{ LOCKFS_ULOCK,		"Unlock" },
	{ LOCKFS_ELOCK,		"Error Lock" },
	{ LOCKFS_HLOCK,		"Hard Lock" },
	{ LOCKFS_OLOCK,		"Old Lock" },
	{ LOCKFS_BADLOCK,	NULL }
};

#endif /* DEBUG */

/*
 * ufs_fault, ufs_fault_v
 *
 *  called instead of cmn_err(CE_PANIC, ...) by ufs routines
 *  when a failure is detected to put the file system into an
 *  error state (if possible) or to devolve to a panic otherwise
 *
 * vnode is some vnode in this file system, used to find the way
 * to ufsvfs, vfsp etc.  Since a panic can be called from many
 * levels, the vnode is the most convenient hook to pass through.
 *
 * ufs_fault() is the varargs front end; it returns whatever
 * ufs_fault_v() returns.
 */

/*PRINTFLIKE2*/
int
ufs_fault(vnode_t *vp, char *fmt, ...)
{
	va_list	adx;
	int	error;

	MINOR(("[ufs_fault"));

	va_start(adx, fmt);
	error = ufs_fault_v(vp, fmt, adx);
	va_end(adx);

	MINOR((": %s (%d)]\n", err_name(error), error));
	return (error);
}

/* substituted by ufs_fault_v() when the caller passes a NULL format */
const char *nullfmt = "";

/*
 * ufs_fault_v
 *
 * Marks the superblock FSBAD and writes it out, then either panics or
 * queues a failure record for the fix-failure thread, depending on what
 * triage() decided.  Returns ERESTART unless the panic path was taken,
 * in which case err is set to EAGAIN after real_panic_v() returns.
 */
static int
ufs_fault_v(vnode_t *vp, char *fmt, va_list adx)
{
	ufs_failure_t		*new = NULL;
	ufsvfs_t		*ufsvfsp;
	triage_t		 fix;
	int			 err = ERESTART;
	int			need_vfslock;

	MINOR(("[ufs_fault_v"));

	if (fmt == NULL)
		fmt = (char *)nullfmt;

	fix = triage(vp);

	if (vp) {
		ufsvfsp = (struct ufsvfs *)vp->v_vfsp->vfs_data;

		/*
		 * Something bad has happened. That is why we are here.
		 *
		 * In order for the bad thing to be recorded in the superblock
		 * we need to write to the superblock directly.
		 * In the case that logging is enabled the logging code
		 * would normally intercept our write as a delta to the log,
		 * thus we mark the filesystem FSBAD in any case.
		 */
		/* take vfs_lock only if this thread does not already own it */
		need_vfslock = !MUTEX_HELD(&ufsvfsp->vfs_lock);

		if (need_vfslock) {
			mutex_enter(&ufsvfsp->vfs_lock);
		}

		ufsvfsp->vfs_fs->fs_clean = FSBAD;
		ASSERT(SEMA_HELD(&ufsvfsp->vfs_bufp->b_sem));
		/* clear stale state so the buffer goes out as a sync write */
		ufsvfsp->vfs_bufp->b_flags &=
			~(B_ASYNC | B_READ | B_DONE | B_ERROR | B_DELWRI);

		(void) bdev_strategy(ufsvfsp->vfs_bufp);
		(void) biowait(ufsvfsp->vfs_bufp);

		if (need_vfslock) {
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	}

	switch (fix) {

	default:
	case TRIAGE_DEAD:
	case TRIAGE_NO_SPIRIT:

		real_panic_v(new, fmt, adx);
		/* LINTED: warning: logical expression always true: op "||" */
		ASSERT(DEBUG);
		err = EAGAIN;

		/*
		 * Only reachable if real_panic_v() returned.  DEBUG builds
		 * continue into the repair path when DBGFLG_FIXWOULDPANIC
		 * is set.
		 */
#if defined(DEBUG)
		if (!(DEBUG_FLAGS & DBGFLG_FIXWOULDPANIC)) {
			break;
		}
		/* FALLTHROUGH */

#else
		break;

#endif /* DEBUG */

	case TRIAGE_ATTEND_TO:

		/* q thread not running yet? */
		mutex_enter(&ufs_fix.uq_mutex);
		if (!ufs_fix.uq_threadp) {
			/* uq_mutex is dropped across thread creation */
			mutex_exit(&ufs_fix.uq_mutex);
			ufs_thread_start(&ufs_fix, ufsfx_thread_fix_failures,
			    NULL);
			ufs_fix.uq_threadp->t_flag |= T_DONTBLOCK;
			mutex_enter(&ufs_fix.uq_mutex);
		} else {
			/*
			 * We got the lock but we are not the current
			 * threadp so we have to release the lock.
			 */
			MINOR((": fix failure thread already running "));
		}

		/* a fault raised by the fix thread itself is only warned about */
		if (ufs_fix.uq_threadp && ufs_fix.uq_threadp == curthread) {
			mutex_exit(&ufs_fix.uq_mutex);
			cmn_err(CE_WARN, "ufs_fault_v: recursive ufs_fault");
		} else {
			mutex_exit(&ufs_fix.uq_mutex);
		}

		new = init_failure(vp, fmt, adx);
		if (new != NULL) {
			queue_failure(new);
			break;
		}
		/*
		 * NOTE(review): init_failure() may already have walked adx
		 * (it formats the message with vsnprintf) before failing;
		 * adx is then passed on again here without a va_copy --
		 * confirm this is safe on all supported ABIs.
		 */
		real_panic_v(new, fmt, adx);
		break;

	}
	MINOR(("] "));
	return (err);
}

/*
 * triage()
 *
 *  Attempt to fix iff:
 *    - the system is not already panicking
 *    - this file system isn't explicitly marked not to be fixed
 *    - we can connect to the user-level daemon
 * These conditions are detectable later, but if we can determine
 * them in the failing threads context the core dump may be more
 * useful.
* */ static triage_t triage(vnode_t *vp) { struct inode *ip; int need_unlock_vfs; int fs_flags; MINUTE(("[triage")); if (panicstr) { MINUTE(( ": already panicking: \"%s\" => TRIAGE_DEAD]\n", panicstr)); return (TRIAGE_DEAD); } if (!vp || !(ip = VTOI(vp)) || !ip->i_ufsvfs) { MINUTE(( ": vp, ip or ufsvfs is NULL; can't determine fs => TRIAGE_DEAD]\n")); return (TRIAGE_DEAD); } /* use tryenter and continue no matter what since we're panicky */ need_unlock_vfs = !MUTEX_HELD(&ip->i_ufsvfs->vfs_lock); if (need_unlock_vfs) need_unlock_vfs = mutex_tryenter(&ip->i_ufsvfs->vfs_lock); fs_flags = ip->i_ufsvfs->vfs_fsfx.fx_flags; if (need_unlock_vfs) mutex_exit(&ip->i_ufsvfs->vfs_lock); if (fs_flags & UFSFX_PANIC) { MINUTE(( ": filesystem marked \"panic\" => TRIAGE_NO_SPIRIT]\n")); return (TRIAGE_NO_SPIRIT); } if (ufs_checkaccton(vp) != 0) { MINUTE(( ": filesystem would deadlock (accounting) => TRIAGE_DEAD]\n")); return (TRIAGE_DEAD); } if (ufs_checkswapon(vp) != 0) { MINUTE(( ": filesystem would deadlock (swapping) => TRIAGE_DEAD]\n")); return (TRIAGE_DEAD); } MINUTE((": return TRIAGE_ATTEND_TO] ")); return (TRIAGE_ATTEND_TO); } /* * init failure * * This routine allocates a failure struct and initializes * it's member elements. * Space is allocated for copies of dynamic identifying fs structures * passed in. Without a much more segmented kernel architecture * this is as protected as we can make it (for now.) */ static ufs_failure_t * init_failure(vnode_t *vp, char *fmt, va_list adx) { ufs_failure_t *new; struct inode *ip; int initialization_worked = 0; int need_vfs_unlock; MINOR(("[init_failure")); new = kmem_zalloc(sizeof (ufs_failure_t), KM_NOSLEEP); if (!new) { MINOR((": kmem_zalloc failed]\n")); return (NULL); } /* * enough information to make a fix attempt possible? 
*/ if (!vp || !(ip = VTOI(vp)) || !ip->i_ufsvfs || !vp->v_vfsp || !ip->i_ufsvfs->vfs_bufp || !ITOF(ip) || !fmt) goto errout; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VBLK && vp->v_type != VCHR && vp->v_type != VLNK && vp->v_type != VFIFO && vp->v_type != VSOCK) goto errout; if (ip->i_ufsvfs->vfs_root->v_type != VREG && ip->i_ufsvfs->vfs_root->v_type != VDIR && ip->i_ufsvfs->vfs_root->v_type != VBLK && ip->i_ufsvfs->vfs_root->v_type != VCHR && ip->i_ufsvfs->vfs_root->v_type != VLNK && ip->i_ufsvfs->vfs_root->v_type != VFIFO && ip->i_ufsvfs->vfs_root->v_type != VSOCK) goto errout; if ((ITOF(ip)->fs_magic != FS_MAGIC) && (ITOF(ip)->fs_magic != MTB_UFS_MAGIC)) goto errout; /* intialize values */ (void) vsnprintf(new->uf_panic_str, LOCKFS_MAXCOMMENTLEN - 1, fmt, adx); new->uf_ufsvfsp = ip->i_ufsvfs; new->uf_vfsp = ip->i_vfs; mutex_init(&new->uf_mutex, NULL, MUTEX_DEFAULT, NULL); need_vfs_unlock = !MUTEX_HELD(&ip->i_ufsvfs->vfs_lock); if (need_vfs_unlock) { if (!mutex_tryenter(&ip->i_ufsvfs->vfs_lock)) { /* * not much alternative here, but we're panicking * already, it couldn't be worse - so just * proceed optimistically and take note. 
*/ mutex_enter(&uf_stats.ufst_mutex); uf_stats.ufst_lock_violations++; mutex_exit(&uf_stats.ufst_mutex); MINOR((": couldn't get vfs lock")) need_vfs_unlock = 0; } } if (mutex_tryenter(&new->uf_mutex)) { initialization_worked = set_state(new, UF_INIT); mutex_exit(&new->uf_mutex); } if (need_vfs_unlock) mutex_exit(&ip->i_ufsvfs->vfs_lock); if (initialization_worked) { MINOR(("] ")); return (new); } /* FALLTHROUGH */ errout: if (new) kmem_free(new, sizeof (ufs_failure_t)); MINOR((": failed]\n")); return (NULL); } static void queue_failure(ufs_failure_t *new) { MINOR(("[queue_failure")); mutex_enter(&ufs_fix.uq_mutex); if (ufs_fix.uq_ufhead) insque(new, &ufs_fix.uq_ufhead); else ufs_fix.uq_ufhead = new; if (mutex_tryenter(&new->uf_mutex)) { (void) set_state(new, UF_QUEUE); mutex_exit(&new->uf_mutex); } mutex_enter(&uf_stats.ufst_mutex); /* force wakeup */ ufs_fix.uq_ne = ufs_fix.uq_lowat = uf_stats.ufst_num_failed; mutex_exit(&uf_stats.ufst_mutex); cv_broadcast(&ufs_fix.uq_cv); DCALL(DBGLVL_MAJOR, cmn_err(CE_WARN, new->uf_panic_str? new->uf_panic_str: "queue_failure: NULL panic str?")); mutex_exit(&ufs_fix.uq_mutex); MINOR(("] ")); } /*PRINTFLIKE2*/ static void real_panic(ufs_failure_t *f, const char *fmt, ...) { va_list adx; MINUTE(("[real_panic ")); va_start(adx, fmt); real_panic_v(f, fmt, adx); va_end(adx); MINUTE((": return?!]\n")); } static void real_panic_v(ufs_failure_t *f, const char *fmt, va_list adx) { int seriousness = CE_PANIC; int need_unlock; MINUTE(("[real_panic_v ")); if (f && f->uf_ufsvfsp) TRANS_SETERROR(f->uf_ufsvfsp); #if defined(DEBUG) if (DEBUG_FLAGS & DBGFLG_NOPANIC) { seriousness = CE_WARN; cmn_err(CE_WARN, "real_panic: EWOULDPANIC\n"); } #endif /* DEBUG */ delay(hz >> 1); /* allow previous warnings to get out */ if (!f && fmt) vcmn_err(seriousness, fmt, adx); else cmn_err(seriousness, f && f->uf_panic_str? 
f->uf_panic_str: "real_panic: "); if (f) { need_unlock = !MUTEX_HELD(&f->uf_mutex); if (need_unlock) { mutex_enter(&f->uf_mutex); } f->uf_retry = -1; (void) set_state(f, UF_PANIC); if (need_unlock) { mutex_exit(&f->uf_mutex); } } MINUTE((": return?!]\n")); } /* * initializes ufs panic structs, locks, etc */ void ufsfx_init(void) { MINUTE(("[ufsfx_init")); /* patchable; unchanged while running, so no lock is needed */ ufsfx_tune.uft_too_long = UF_TOO_LONG; ufsfx_tune.uft_fixstart_period = UF_FIXSTART_PERIOD; ufsfx_tune.uft_fixpoll_period = UF_FIXPOLL_PERIOD; ufsfx_tune.uft_short_err_period = UF_SHORT_ERROR_PERIOD; ufsfx_tune.uft_long_err_period = UF_LONG_ERROR_PERIOD; uffsinfo.ufi_statp = &uf_stats; uffsinfo.ufi_tunep = &ufsfx_tune; uffsinfo.ufi_statetab = &state_desc[0]; mutex_init(&uf_stats.ufst_mutex, NULL, MUTEX_DEFAULT, NULL); ufs_thread_init(&ufs_fix, /* maxne */ 1); MINUTE(("] ")); } /* * initializes per-ufs values * returns 0 (ok) or errno */ int ufsfx_mount(struct ufsvfs *ufsvfsp, int flags) { MINUTE(("[ufsfx_mount (%d)", flags)); /* don't check/need vfs_lock because it's still being initialized */ ufsvfsp->vfs_fsfx.fx_flags = (flags & UFSMNT_ONERROR_FLGMASK) >> 4; MINUTE((": %s: fx_flags:%ld,", ufsvfsp->vfs_fs->fs_fsmnt, ufsvfsp->vfs_fsfx.fx_flags)); /* * onerror={panic ^ lock only ^ unmount} */ if (ufsvfsp->vfs_fsfx.fx_flags & UFSFX_PANIC) { MINUTE((" PANIC")); } else if (ufsvfsp->vfs_fsfx.fx_flags & UFSFX_LCKONLY) { MINUTE((" LCKONLY")); } else if (ufsvfsp->vfs_fsfx.fx_flags & UFSFX_LCKUMOUNT) { MINUTE((" LCKUMOUNT")); } else { ufsvfsp->vfs_fsfx.fx_flags = UFSFX_DEFAULT; ASSERT(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_FLGMASK >> 4)); MINUTE((" DEFAULT")); } pollwakeup(&ufs_pollhd, POLLPRI); MINUTE(("]\n")); return (0); } /* * ufsfx_unmount * * called during unmount */ void ufsfx_unmount(struct ufsvfs *ufsvfsp) { ufs_failure_t *f; int must_unlock_list; MINUTE(("[ufsfx_unmount")); if (!ufsvfsp) { MINUTE((": no ufsvfsp]")); return; } if 
((must_unlock_list = !MUTEX_HELD(&ufs_fix.uq_mutex)) != 0)
		mutex_enter(&ufs_fix.uq_mutex);

	/* detach every failure record that points at this file system */
	for (f = ufs_fix.uq_ufhead; f; f = f->uf_next) {
		int must_unlock_failure;

		must_unlock_failure = !MUTEX_HELD(&f->uf_mutex);
		if (must_unlock_failure) {
			mutex_enter(&f->uf_mutex);
		}

		if (f->uf_ufsvfsp == ufsvfsp) {

			/*
			 * if we owned the failure record lock, then this
			 * is probably a fix failure-triggered unmount, so
			 * the warning is not appropriate or needed
			 */

			/* XXX if rebooting don't print this? */
			if (!terminal_state(f->uf_s) && must_unlock_failure) {
				cmn_err(CE_WARN,
					"Unmounting %s while error-locked",
					fs_name(f));
			}

			/* drop every reference into the departing ufsvfs */
			f->uf_ufsvfsp		= NULL;
			f->uf_vfs_ufsfxp	= NULL;
			f->uf_vfs_lockp		= NULL;
			f->uf_bp		= NULL;
			f->uf_vfsp		= NULL;
			f->uf_retry		= -1;
		}

		if (must_unlock_failure)
			mutex_exit(&f->uf_mutex);
	}
	if (must_unlock_list)
		mutex_exit(&ufs_fix.uq_mutex);

	pollwakeup(&ufs_pollhd, POLLPRI | POLLHUP);
	MINUTE(("] "));
}

/*
 * ufsfx_(un)lockfs
 *
 * provides hook from lockfs code so we can recognize unlock/relock
 * This is called after it is certain that the (un)lock will succeed.
 *
 * ufsfx_unlockfs() moves every non-terminal failure record of the given
 * file system to UF_FIXED, passing through UF_FIXING first when needed.
 */
void
ufsfx_unlockfs(struct ufsvfs *ufsvfsp)
{
	ufs_failure_t	*f;
	int		 need_unlock;
	int		 need_unlock_list;
	int		 informed = 0;	/* the notice is printed at most once */

	MINUTE(("[ufsfx_unlockfs"));

	if (!ufsvfsp)
		return;

	/* both the list lock and each record lock are taken only if unowned */
	need_unlock_list = !MUTEX_HELD(&ufs_fix.uq_mutex);

	if (need_unlock_list)
		mutex_enter(&ufs_fix.uq_mutex);

	for (f = ufs_fix.uq_ufhead; f; f = f->uf_next) {

		need_unlock = !MUTEX_HELD(&f->uf_mutex);
		if (need_unlock)
			mutex_enter(&f->uf_mutex);

		if (f->uf_ufsvfsp == ufsvfsp && !terminal_state(f->uf_s)) {
			if (!(f->uf_s & UF_FIXING)) {
				/*
				 * This might happen if we don't notice that
				 * the fs gets marked FSFIX before it is
				 * marked FSCLEAN, as might occur if the
				 * the superblock was hammered directly.
				 */
				if (!informed) {
					informed = 1;
					cmn_err(CE_NOTE,
						"Unlock of %s succeeded before fs_clean marked FSFIX?",
						fs_name(f));
				}

				/*
				 * pass through fixing state so
				 * transition protocol is satisfied
				 */
				if (!set_state(f, UF_FIXING)) {
					MINUTE((": failed] "));
				}
			}

			if (!set_state(f, UF_FIXED)) {
				/* it's already fixed, so don't panic now */
				MINUTE((": failed] "));
			}
		}

		if (need_unlock)
			mutex_exit(&f->uf_mutex);
	}
	if (need_unlock_list)
		mutex_exit(&ufs_fix.uq_mutex);
	MINUTE(("] "));
}

/*
 * ufsfx_lockfs
 *
 * Lock-side counterpart of ufsfx_unlockfs(): advances each non-terminal,
 * non-panicking failure record of the given file system by one step,
 * UF_TRYLCK -> UF_LOCKED or UF_LOCKED -> UF_FIXING.  UF_FIXING is left
 * alone and any other state draws a warning.
 */
void
ufsfx_lockfs(struct ufsvfs *ufsvfsp)
{
	ufs_failure_t	*f;
	int		 need_unlock;
	int		 need_unlock_list;

	MINUTE(("[ufsfx_lockfs"));

	if (!ufsvfsp)
		return;

	need_unlock_list = !MUTEX_HELD(&ufs_fix.uq_mutex);

	if (need_unlock_list)
		mutex_enter(&ufs_fix.uq_mutex);

	for (f = ufs_fix.uq_ufhead; f; f = f->uf_next) {

		need_unlock = !MUTEX_HELD(&f->uf_mutex);
		if (need_unlock)
			mutex_enter(&f->uf_mutex);

		if (f->uf_ufsvfsp == ufsvfsp && !terminal_state(f->uf_s) &&
		    f->uf_s != UF_PANIC) {
			switch (f->uf_s) {

			default:
				cmn_err(CE_WARN,
					"fs %s not in state UF_TRYLCK, UF_LOCKED or UF_FIXING",
					fs_name(f));
				break;

			case UF_TRYLCK:
				if (!set_state(f, UF_LOCKED)) {
					MINUTE((": failed] "));
				}
				break;

			case UF_LOCKED:
				if (!set_state(f, UF_FIXING)) {
					MINUTE((": failed] "));
				}
				break;

			case UF_FIXING:
				break;

			}
		}

		if (need_unlock)
			mutex_exit(&f->uf_mutex);
	}
	if (need_unlock_list)
		mutex_exit(&ufs_fix.uq_mutex);

	MINUTE(("] "));
}

/*
 * error lock, trigger fsck and unlock those fs with failures
 * blatantly copied from the hlock routine, although this routine
 * triggers differently in order to use uq_ne as meaningful data.
*/
/* ARGSUSED */
void
ufsfx_thread_fix_failures(void *ignored)
{
	int		retry;
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &ufs_fix.uq_mutex, callb_generic_cpr,
	    "ufsfixfail");

	MINUTE(("[ufsfx_thread_fix_failures] "));

	for (;;) {
		/* sleep until there is work to do */
		mutex_enter(&ufs_fix.uq_mutex);
		(void) ufs_thread_run(&ufs_fix, &cprinfo);
		ufs_fix.uq_ne = 0;
		mutex_exit(&ufs_fix.uq_mutex);

		/* process failures on our q */
		do {
			retry = ufsfx_do_failure_q();
			if (retry) {
				/* wait `retry' seconds, or until signalled */
				mutex_enter(&ufs_fix.uq_mutex);
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&ufs_fix.uq_cv,
				    &ufs_fix.uq_mutex, lbolt + (hz * retry));
				CALLB_CPR_SAFE_END(&cprinfo,
				    &ufs_fix.uq_mutex);
				mutex_exit(&ufs_fix.uq_mutex);
			}
		} while (retry);
	}
	/* NOTREACHED */
}


/*
 * watch for fix-on-panic work
 *
 * returns # of seconds to sleep before trying again
 * and zero if no retry is needed
 */

int
ufsfx_do_failure_q(void)
{
	ufs_failure_t	*f;
	long		 retry = 1;
	ufsd_t		*s;

	MAJOR(("[ufsfx_do_failure_q"));
	DCALL(DBGLVL_HIDEOUS, dump_uf_list(NULL));

	/* list busy: come back in one second (retry is still 1 here) */
	if (!mutex_tryenter(&ufs_fix.uq_mutex))
		return (retry);

	retry = 0;
rescan_q:

	/*
	 * walk down failure list
	 *  depending on state of each failure, do whatever
	 *  is appropriate to move it to the next state
	 *  taking note of whether retry gets set
	 *
	 * retry protocol:
	 * wakeup in shortest required time for any failure
	 *   retry == 0; nothing more to do (terminal state)
	 *   retry < 0; reprocess queue immediately, retry will
	 *		be abs(retry) for the next cycle
	 *   retry > 0; schedule wakeup for retry seconds
	 */

	for (f = ufs_fix.uq_ufhead; f; f = f->uf_next) {

		/* record busy: skip it and make sure we come back soon */
		if (!mutex_tryenter(&f->uf_mutex)) {
			retry = 1;
			continue;
		}
		s = get_state_desc(f->uf_s);

		MINOR((": found%s: %s, \"%s: %s\"\n",
			s->ud_attr.terminal? " old": "",
			fs_name(f), state_name(f->uf_s), f->uf_panic_str));

		/* terminal states need no further action */
		if (s->ud_attr.terminal) {
			mutex_exit(&f->uf_mutex);
			continue;
		}

		if (s->ud_sfp)
			(*s->ud_sfp)(f, UFA_FOUND, f->uf_s);

		ASSERT(terminal_state(f->uf_s) || f->uf_retry != 0);

		if (f->uf_retry != 0) {
			/* keep the smallest request; negative sorts first */
			if (retry > f->uf_retry || retry == 0)
				retry = f->uf_retry;
			if (f->uf_retry < 0)
				f->uf_retry = abs(f->uf_retry);
		}
		mutex_exit(&f->uf_mutex);
	}

	/* a negative request means "rescan now", see the protocol above */
	if (retry < 0) {
		retry = abs(retry);
		goto rescan_q;
	}

	mutex_exit(&ufs_fix.uq_mutex);

	DCALL(DBGLVL_HIDEOUS, dump_uf_list(NULL));
	MAJOR((": retry=%ld, good night]\n\n", retry));

	return (retry);
}

/*
 * pester_msg
 *
 * Reports, at the given cmn_err severity, that the file system is still
 * waiting for its repair to start (UF_LOCKED) or finish (otherwise).
 */
static void
pester_msg(ufs_failure_t *f, int seriousness)
{
	MINUTE(("[pester_msg"));
	ASSERT(f->uf_s & (UF_LOCKED | UF_FIXING));

	/*
	 * XXX if seems too long for this fs, poke administrator
	 * XXX to run fsck manually (and change retry time?)
	 */
	cmn_err(seriousness,
		"Waiting for repair of %s to %s",
		fs_name(f),
		f->uf_s & UF_LOCKED? "start": "finish");
	MINUTE(("]"));
}

/*
 * trylock_time_exceeded
 *
 * Returns the number of seconds by which the failure has outlived
 * uft_too_long since entering its current state, or 0 if it has not;
 * warns when it has.  Caller must hold f->uf_mutex.
 */
static time_t
trylock_time_exceeded(ufs_failure_t *f)
{
	time_t		toolong;
	extern time_t	time;

	MINUTE(("[trylock_time_exceeded"));
	ASSERT(MUTEX_HELD(&f->uf_mutex));

	toolong = (time_t)ufsfx_tune.uft_too_long + f->uf_entered_tm;
	if (time > toolong)
		cmn_err(CE_WARN, "error-lock timeout exceeded: %s", fs_name(f));

	MINUTE(("] "));
	return (time <= toolong?
0: time - toolong); } static int get_lockfs_status(ufs_failure_t *f, struct lockfs *lfp) { MINUTE(("[get_lockfs_status")); if (!f->uf_ufsvfsp) { MINUTE((": ufsvfsp is NULL]\n")); return (0); } ASSERT(MUTEX_HELD(&f->uf_mutex)); ASSERT(MUTEX_NOT_HELD(f->uf_vfs_lockp)); ASSERT(!vfs_lock_held(f->uf_vfsp)); ASSERT(f->uf_ufsvfsp->vfs_root != NULL); f->uf_lf_err = ufs_fiolfss(f->uf_ufsvfsp->vfs_root, lfp); if (f->uf_lf_err) { f->uf_retry = ufsfx_tune.uft_short_err_period; } MINUTE(("] ")); return (1); } static sfrc_t set_state(ufs_failure_t *f, ufs_failure_states_t new_state) { ufsd_t *s; sfrc_t sfrc = SFRC_FAIL; int need_unlock; extern time_t time; HIDEOUS(("[set_state: new state:%s", state_name(new_state))); ASSERT(f); ASSERT(MUTEX_HELD(&f->uf_mutex)); /* * if someone else is panicking, just let panic sync proceed */ if (panicstr) { (void) set_state(f, UF_NOTFIX); HIDEOUS((": state reset: not fixed] ")); return (sfrc); } /* * bad state transition, an internal error */ if (!state_trans_valid(f->uf_s, new_state)) { /* recursion */ if (!(f->uf_s & UF_PANIC) && !(new_state & UF_PANIC)) (void) set_state(f, UF_PANIC); MINOR((": state reset: transition failure (\"%s\"->\"%s\")] ", state_name(f->uf_s), state_name(new_state))); return (sfrc); } s = get_state_desc(new_state); need_unlock = !MUTEX_HELD(&ufs_fix.uq_mutex); if (need_unlock) mutex_enter(&ufs_fix.uq_mutex); if (s->ud_attr.at_fail && ufs_fix.uq_threadp && curthread == ufs_fix.uq_threadp) { cmn_err(CE_WARN, "set_state: probable recursive panic of %s", fs_name(f)); } if (need_unlock) mutex_exit(&ufs_fix.uq_mutex); /* NULL state functions always succeed */ sfrc = !s->ud_sfp? 
SFRC_SUCCESS: (*s->ud_sfp)(f, UFA_SET, new_state); if (sfrc == SFRC_SUCCESS && f->uf_s != new_state) { f->uf_s = new_state; f->uf_entered_tm = time; f->uf_counter = 0; } HIDEOUS(("]\n")); return (sfrc); } static ufsd_t * get_state_desc(ufs_failure_states_t state) { ufsd_t *s; HIDEOUS(("[get_state_desc")); for (s = &state_desc[1]; s->ud_name != NULL; s++) { if (s->ud_v == state) { HIDEOUS(("] ")); return (s); } } HIDEOUS(("] ")); return (&state_desc[0]); /* default */ } static sfrc_t sf_undef(ufs_failure_t *f, ufsa_t a, ufs_failure_states_t s) { sfrc_t rc; TRIVIA(("[sf_undef, action is %s, state is %s\n", act_name(a), state_name(s))); ASSERT(s == UF_UNDEF); /* shouldn't find null failure records or ever set one */ rc = set_state(f, UF_NOTFIX); TRIVIA(("] ")); return (rc); } static sfrc_t sf_init( ufs_failure_t *f, ufsa_t a, ufs_failure_states_t s) { sfrc_t rc = SFRC_FAIL; extern time_t time; TRIVIA(("[sf_init, action is %s", act_name(a))); ASSERT(s & UF_INIT); switch (a) { case UFA_SET: f->uf_begin_tm = time; f->uf_retry = 1; if (!f->uf_ufsvfsp) { (void) set_state(f, UF_PANIC); TRIVIA((": NULL ufsvfsp]\n")); return (rc); } /* * because we can call panic from many different levels, * we can't be sure that we've got the vfs_lock at this * point. 
However, there's not much alternative and if
		 * we don't (have the lock) the worst case is we'll just
		 * panic again
		 */
		f->uf_vfs_lockp		= &f->uf_ufsvfsp->vfs_lock;
		f->uf_vfs_ufsfxp	= &f->uf_ufsvfsp->vfs_fsfx;

		if (!f->uf_ufsvfsp->vfs_bufp) {
			(void) set_state(f, UF_PANIC);
			TRIVIA((": NULL vfs_bufp]\n"));
			return (rc);
		}
		f->uf_bp = f->uf_ufsvfsp->vfs_bufp;

		if (!f->uf_ufsvfsp->vfs_bufp->b_un.b_fs) {
			(void) set_state(f, UF_PANIC);
			TRIVIA((": NULL vfs_fs]\n"));
			return (rc);
		}

		/* vfs_fs = vfs_bufp->b_un.b_fs */
		/* keep a private copy of the mount point name */
		bcopy(f->uf_ufsvfsp->vfs_fs->fs_fsmnt, f->uf_fsname, MAXMNTLEN);

		f->uf_lf.lf_lock  = LOCKFS_ELOCK;	/* primer */

		if (!f->uf_vfsp || f->uf_vfsp->vfs_dev == NODEV) {
			(void) set_state(f, UF_PANIC);
			TRIVIA((": NULL vfsp or vfs_dev == NODEV"));
			return (rc);
		}
		f->uf_dev = f->uf_vfsp->vfs_dev;

		rc = SFRC_SUCCESS;
		break;

	case UFA_FOUND:
	default:
		/* failures marked init shouldn't even be on the queue yet */
		rc = set_state(f, UF_QUEUE);
		TRIVIA((": found failure with state init]\n"));
	}

	TRIVIA(("] "));
	return (rc);
}

/*
 * sf_queue
 *
 * State function for UF_QUEUE.
 *   UFA_SET:	counts the failure and, if the vfs lock can be had without
 *		blocking, records f as the file system's current failure.
 *   UFA_FOUND:	hands off to sf_found_queue().
 * Fails immediately if the record no longer has a ufsvfs.
 */
static sfrc_t
sf_queue(
	ufs_failure_t	*f,
	ufsa_t	 a,
	ufs_failure_states_t	 s)
{
	sfrc_t		rc = SFRC_FAIL;

	TRIVIA(("[sf_queue, action is %s", act_name(a)));
	ASSERT(s & UF_QUEUE);

	if (!f->uf_ufsvfsp) {
		TRIVIA((": NULL ufsvfsp]\n"));
		return (rc);
	}

	switch (a) {
	case UFA_FOUND:
		rc = sf_found_queue(f);
		break;

	case UFA_SET:

		ASSERT(MUTEX_HELD(&ufs_fix.uq_mutex));

		mutex_enter(&uf_stats.ufst_mutex);
		uf_stats.ufst_num_failed++;
		mutex_exit(&uf_stats.ufst_mutex);

		/*
		 * if can't get the vfs lock, just wait until
		 * UF_TRYLCK to set fx_current
		 */
		if (mutex_tryenter(f->uf_vfs_lockp)) {
			f->uf_vfs_ufsfxp->fx_current = f;
			mutex_exit(f->uf_vfs_lockp);
		} else {
			mutex_enter(&uf_stats.ufst_mutex);
			uf_stats.ufst_current_races++;
			mutex_exit(&uf_stats.ufst_mutex);
		}

		f->uf_retry = 1;
		rc = SFRC_SUCCESS;
		TRIVIA(("] "));
		break;

	default:
		(void) set_state(f, UF_PANIC);
		TRIVIA((": failed] "));
	}

	return (rc);
}

/*
 * sf_found_queue
 *
 * Handles a record found in UF_QUEUE: depending on the file system's
 * flags and on whether another live failure already owns it, moves the
 * record on to UF_PANIC, UF_REPLICA or UF_TRYLCK.
 */
static sfrc_t
sf_found_queue(ufs_failure_t *f)
{
	int		replica;
	sfrc_t		rc = SFRC_FAIL;

TRIVIA(("[sf_found_queue"));

	/*
	 * don't need to check for null ufsvfsp because
	 * unmount must own list's ufs_fix.uq_mutex
	 * to mark it null and we own that lock since
	 * we got here.
	 */

	ASSERT(MUTEX_HELD(&ufs_fix.uq_mutex));
	ASSERT(MUTEX_NOT_HELD(f->uf_vfs_lockp));

	if (!mutex_tryenter(f->uf_vfs_lockp)) {
		TRIVIA((": tryenter(vfslockp) failed; retry]\n"));
		f->uf_retry = 1;
		return (rc);
	}

	/*
	 * a replica is a failure on a file system whose current failure
	 * is some other record that has not yet reached a terminal state
	 */
	replica = f->uf_vfs_ufsfxp && f->uf_vfs_ufsfxp->fx_current != NULL &&
	    f->uf_vfs_ufsfxp->fx_current != f &&
	    !terminal_state(f->uf_vfs_ufsfxp->fx_current->uf_s);

	/*
	 * copy general flags to this ufs_failure so we don't
	 * need to refer back to the ufsvfs, or, more importantly,
	 * don't need to keep acquiring (trying to acquire) vfs_lockp
	 *
	 * The most restrictive option wins:
	 *  panic > errlock only > errlock+unmount > repair
	 * XXX panic > elock > elock > elock+umount
	 */
	/*
	 * NOTE(review): uf_vfs_ufsfxp is tested for NULL in the replica
	 * expression above but dereferenced unconditionally below --
	 * confirm it cannot be NULL here.
	 */
	if (f->uf_vfs_ufsfxp->fx_flags & UFSFX_PANIC) {
		if (!set_state(f, UF_PANIC)) {
			TRIVIA((": marked panic but was queued?"));
			real_panic(f, " ");
			/*NOTREACHED*/
		}
		mutex_exit(f->uf_vfs_lockp);
		return (rc);
	}
	f->uf_flags = f->uf_vfs_ufsfxp->fx_flags;

	if (replica) {
		if (!set_state(f, UF_REPLICA)) {
			f->uf_retry = 1;
			TRIVIA((": set to replica failed] "));
		} else {
			TRIVIA(("] "));
		}
		/* rc is not updated on this path, whichever way it went */
		mutex_exit(f->uf_vfs_lockp);
		return (rc);
	}
	mutex_exit(f->uf_vfs_lockp);

	/* the vfs lock is released before moving on to UF_TRYLCK */
	if (!set_state(f, UF_TRYLCK)) {
		TRIVIA((": failed] "));
	} else {
		rc = SFRC_SUCCESS;
	}
	return (rc);
}

/*
 * sf_nonterm_cmn
 *
 * Common state function for the non-terminal states UF_TRYLCK,
 * UF_LOCKED, UF_UMOUNT and UF_FIXING.  Dispatches on the action and then
 * on the state; any unexpected combination sends the record to UF_PANIC.
 * A record whose ufsvfs is gone is moved to UF_NOTFIX unless it is
 * currently in UF_UMOUNT.
 */
static sfrc_t
sf_nonterm_cmn(ufs_failure_t *f, ufsa_t a, ufs_failure_states_t s)
{
	sfrc_t	rc = SFRC_FAIL;

	TRIVIA(("[sf_nonterm_cmn, action: %s, %s", act_name(a), state_name(s)));
	ASSERT(s & (UF_TRYLCK | UF_LOCKED | UF_UMOUNT | UF_FIXING));
	ASSERT(!terminal_state(s));

	if (!f->uf_ufsvfsp && !(f->uf_s & UF_UMOUNT)) {
		TRIVIA((": NULL ufsvfsp (state != UMOUNT)]\n"));
		(void) set_state(f, UF_NOTFIX);
		return (rc);
	}

	switch (a) {
	case UFA_SET:
		switch (s) {
		case UF_TRYLCK:
			ASSERT(MUTEX_NOT_HELD(f->uf_vfs_lockp));
			rc = sf_set_trylck(f);
			break;

		case UF_LOCKED:
			rc = sf_set_locked(f);
			break;

		case UF_FIXING:
			/* note that the repair has started; poll for its end */
			f->uf_flags |= UFSFX_REPAIR_START;
			f->uf_retry  = ufsfx_tune.uft_fixpoll_period;
			rc = SFRC_SUCCESS;
			break;

		case UF_UMOUNT:
			/* negative retry: rescan the queue immediately */
			f->uf_retry = -ufsfx_tune.uft_short_err_period;
			rc = SFRC_SUCCESS;
			break;

		default:
			(void) set_state(f, UF_PANIC);
			TRIVIA((": failed] "));
		}
		break;

	case UFA_FOUND:

		switch (s) {
		case UF_TRYLCK:
			rc = sf_found_trylck(f);
			break;

		case UF_LOCKED:
		case UF_FIXING:
			rc = sf_found_lock_fix_cmn(f, s);
			break;

		case UF_UMOUNT:
			rc = sf_found_umount(f);
			break;

		default:
			(void) set_state(f, UF_PANIC);
			TRIVIA((": failed] "));
			break;
		}
		break;
	default:
		(void) set_state(f, UF_PANIC);
		TRIVIA((": failed] "));
		break;
	}

	TRIVIA(("] "));
	return (rc);
}

/*
 * sf_set_trylck
 *
 * UFA_SET action for UF_TRYLCK: claims the file system's fx_current slot
 * if it is empty and primes the lockfs request for an error lock.
 * Fails, asking for a one second retry, if the vfs lock is busy.
 */
static sfrc_t
sf_set_trylck(ufs_failure_t *f)
{
	TRIVIA(("[sf_set_trylck"));

	if (!mutex_tryenter(f->uf_vfs_lockp)) {
		TRIVIA((": tryenter(vfslockp) failed; retry]\n"));
		f->uf_retry = 1;
		return (SFRC_FAIL);
	}

	if (!f->uf_vfs_ufsfxp->fx_current)
		f->uf_vfs_ufsfxp->fx_current = f;

	mutex_exit(f->uf_vfs_lockp);

	f->uf_lf.lf_flags = 0;
	f->uf_lf.lf_lock  = LOCKFS_ELOCK;
	/* negative retry: rescan the queue immediately */
	f->uf_retry = -ufsfx_tune.uft_fixstart_period;
	TRIVIA(("] "));
	return (SFRC_SUCCESS);
}

/*
 * sf_found_trylck
 *
 * UFA_FOUND action for UF_TRYLCK: gives up (UF_PANIC) once the time
 * limit has passed, otherwise fetches the current lockfs status and
 * attempts to set the error lock.  Any failure along the way also ends
 * in UF_PANIC.
 */
static sfrc_t
sf_found_trylck(ufs_failure_t *f)
{
	struct lockfs lockfs_status;

	TRIVIA(("[sf_found_trylck"));

	if (trylock_time_exceeded(f) > 0) {
		(void) set_state(f, UF_PANIC);
		TRIVIA((": failed] "));
		return (SFRC_FAIL);
	}

	if (!get_lockfs_status(f, &lockfs_status)) {
		(void) set_state(f, UF_PANIC);
		TRIVIA((": failed] "));
		return (SFRC_FAIL);
	}

	/* carry the key just read over into our own lock request */
	if (f->uf_lf_err == NO_ERROR)
		f->uf_lf.lf_key = lockfs_status.lf_key;

	if (!set_lockfs(f, &lockfs_status)) {
		(void) set_state(f, UF_PANIC);
		TRIVIA((": failed] "));
		return (SFRC_FAIL);
	}
	TRIVIA(("] "));
	return (SFRC_SUCCESS);
}

/*
 * sf_set_locked
 *
 * UFA_SET action for UF_LOCKED: requests an immediate rescan and clears
 * the repair-started flag.
 */
static sfrc_t
sf_set_locked(ufs_failure_t *f)
{
	TRIVIA(("[sf_set_locked"));

	f->uf_retry = -ufsfx_tune.uft_fixstart_period;

#if defined(DEBUG)
	if (f->uf_flags & UFSFX_REPAIR_START)
		TRIVIA(("clearing UFSFX_REPAIR_START "));
#endif /* DEBUG */

	f->uf_flags &= ~UFSFX_REPAIR_START;

	if (f->uf_s & UF_TRYLCK) {
cmn_err(CE_WARN, "Error-locked %s: \"%s\"", fs_name(f), f->uf_panic_str); if (f->uf_flags & UFSFX_LCKONLY) cmn_err(CE_WARN, "Manual repair of %s required", fs_name(f)); } /* * just reset to current state */ #if defined(DEBUG) TRIVIA(("locked->locked ")); #endif /* DEBUG */ TRIVIA(("] ")); return (SFRC_SUCCESS); } static sfrc_t sf_found_lock_fix_cmn(ufs_failure_t *f, ufs_failure_states_t s) { time_t toolong; extern time_t time; struct buf *bp = NULL; struct fs *dfs; time_t concerned, anxious; sfrc_t rc = SFRC_FAIL; ulong_t gb_size; TRIVIA(("[sf_found_lock_fix_cmn (\"%s\")", state_name(s))); if (s & UF_LOCKED) { ASSERT(MUTEX_HELD(&f->uf_mutex)); toolong = time > (ufsfx_tune.uft_too_long + f->uf_entered_tm); TRIVIA(("%stoolong", !toolong? "not": "")); HIDEOUS((": time:%ld, too long:%ld, entered_tm:%ld ", time, ufsfx_tune.uft_too_long, f->uf_entered_tm)); if (f->uf_flags & UFSFX_LCKUMOUNT) { if (set_state(f, UF_UMOUNT)) { TRIVIA(("] ")); rc = SFRC_SUCCESS; } else { TRIVIA((": failed] ")); f->uf_retry = 1; } return (rc); } if (!toolong) { rc = SFRC_SUCCESS; } else { if (!(f->uf_flags & UFSFX_REPAIR_START)) { cmn_err(CE_WARN, "%s repair of %s not started.", (f->uf_flags & UFSFX_LCKONLY)? "Manual": "Automatic", fs_name(f)); f->uf_retry = ufsfx_tune.uft_long_err_period; } else { f->uf_retry = ufsfx_tune.uft_long_err_period; cmn_err(CE_WARN, "Repair of %s is not timely; operator attention is required.", fs_name(f)); } TRIVIA(("] ")); return (rc); } } #if defined(DEBUG) else { ASSERT(s & UF_FIXING); } #endif /* DEBUG */ /* * get on disk superblock; force it to really * come from the disk */ (void) bfinval(f->uf_dev, 0); bp = UFS_BREAD(f->uf_ufsvfsp, f->uf_dev, SBLOCK, SBSIZE); if (bp) { bp->b_flags |= (B_STALE | B_AGE); dfs = bp->b_un.b_fs; } if (!bp || (bp->b_flags & B_ERROR) || ((dfs->fs_magic != FS_MAGIC) && (dfs->fs_magic != MTB_UFS_MAGIC))) { TRIVIA((": UFS_BREAD(SBLOCK) failed]\n")); f->uf_retry = 1; goto out; } /* fsck started but we haven't noticed yet? 
*/ if (!(s & UF_FIXING) && dfs->fs_clean == FSFIX) { if (!set_state(f, UF_FIXING)) { TRIVIA((": failed]\n")); f->uf_retry = 1; goto out; } } /* fsck started but didn't succeed? */ if ((s & UF_FIXING) && ((dfs->fs_clean == FSBAD) || !fsck_active(f))) { TRIVIA((": fs_clean: %d", (int)dfs->fs_clean)); (void) set_state(f, UF_LOCKED); cmn_err(CE_WARN, "%s: Manual repair is necessary.", fs_name(f)); f->uf_retry = ufsfx_tune.uft_long_err_period; goto out; } gb_size = (dfs->fs_size * dfs->fs_bshift) / GB; toolong = (time_t)((gb_size == 0? 1: gb_size) * SecondsPerGig); /* fsck started but doesn't seem to be proceeding? */ if ((s & UF_FIXING) && dfs->fs_clean == FSFIX) { if (time > f->uf_entered_tm + toolong) { cmn_err(CE_WARN, "Repair completion timeout exceeded on %s; manual fsck may be required", fs_name(f)); f->uf_retry = ufsfx_tune.uft_long_err_period; } } concerned = f->uf_entered_tm + (toolong / 3); anxious = f->uf_entered_tm + ((2 * toolong) / 3); if (time > concerned) pester_msg(f, time > anxious? CE_WARN: CE_NOTE); TRIVIA(("] ")); out: if (bp) brelse(bp); return (rc); } static sfrc_t sf_found_umount(ufs_failure_t *f) { extern time_t time; sfrc_t rc = SFRC_FAIL; struct vfs *vfsp = f->uf_vfsp; struct ufsvfs *ufsvfsp = f->uf_ufsvfsp; int toolong = 0; int err = 0; TRIVIA(("[sf_found_umount")); toolong = time > ufsfx_tune.uft_too_long + f->uf_entered_tm; if (toolong) { TRIVIA((": unmount time limit exceeded] ")); goto out; } if (!vfsp || !ufsvfsp) { /* trivial case */ TRIVIA((": NULL vfsp and/or ufsvfsp, already unmounted?] 
")); goto out; } if (!ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) { TRIVIA((": !not error locked?")); err = EINVAL; goto out; } /* The vn_vfsunlock will be done in dounmount() [.../common/fs/vfs.c] */ if (vn_vfslock(vfsp->vfs_vnodecovered)) { TRIVIA((": couldn't lock coveredvp")); err = EBUSY; goto out; } if ((err = dounmount(vfsp, 0, kcred)) != 0) { /* take note, but not many alternatives here */ mutex_enter(&uf_stats.ufst_mutex); uf_stats.ufst_unmount_failures++; mutex_exit(&uf_stats.ufst_mutex); TRIVIA((": unmount failed] ")); } else { cmn_err(CE_NOTE, "unmounted error-locked %s", fs_name(f)); } out: if (toolong || (err != EBUSY && err != EAGAIN)) rc = set_state(f, UF_NOTFIX); TRIVIA(("] ")); return (rc); } static sfrc_t sf_term_cmn(ufs_failure_t *f, ufsa_t a, ufs_failure_states_t s) { extern time_t time; sfrc_t rc = SFRC_FAIL; TRIVIA(("[sf_term_cmn, action is %s, state is %s", act_name(a), state_name(s))); ASSERT(s & (UF_FIXED | UF_NOTFIX | UF_REPLICA)); ASSERT(terminal_state(s)); if (!f->uf_ufsvfsp && !(f->uf_s & (UF_UMOUNT | UF_NOTFIX))) { TRIVIA((": NULL ufsvfsp (state != UMOUNT | NOTFIX)]\n")); return (rc); } switch (a) { case UFA_SET: switch (s) { case UF_NOTFIX: case UF_FIXED: { int need_lock_vfs; if (f->uf_ufsvfsp && f->uf_vfs_lockp) need_lock_vfs = !MUTEX_HELD(f->uf_vfs_lockp); else need_lock_vfs = 0; if (need_lock_vfs && !mutex_tryenter(f->uf_vfs_lockp)) { TRIVIA((": tryenter(vfslockp) fail; retry]\n")); f->uf_retry = 1; break; } f->uf_end_tm = time; f->uf_lf.lf_lock = LOCKFS_OLOCK; f->uf_retry = 0; if (f->uf_vfs_ufsfxp) f->uf_vfs_ufsfxp->fx_current = NULL; if (need_lock_vfs) mutex_exit(f->uf_vfs_lockp); cmn_err(CE_NOTE, (s & UF_NOTFIX)? 
"Could not fix %s": "%s is now accessible", fs_name(f)); if (s & UF_FIXED) { mutex_enter(&uf_stats.ufst_mutex); uf_stats.ufst_num_fixed++; mutex_exit(&uf_stats.ufst_mutex); } (void) timeout(ufsfx_kill_fix_failure_thread, (void *)(ufsfx_tune.uft_short_err_period * hz), ufsfx_tune.uft_short_err_period * hz); rc = SFRC_SUCCESS; break; } case UF_REPLICA: ASSERT(MUTEX_HELD(f->uf_vfs_lockp)); /* not actually a replica? */ if (f->uf_vfs_ufsfxp && f->uf_vfs_ufsfxp->fx_current && f->uf_vfs_ufsfxp->fx_current != f && !terminal_state(f->uf_vfs_ufsfxp->fx_current->uf_s)) { f->uf_orig = f->uf_vfs_ufsfxp->fx_current; f->uf_retry = 0; rc = SFRC_SUCCESS; } else { TRIVIA((": NULL fx_current]\n")); f->uf_retry = 1; } break; default: rc = set_state(f, UF_PANIC); TRIVIA((": failed] ")); break; } break; case UFA_FOUND: /* * XXX de-allocate these after some period? * XXX or move to an historical list? * XXX or have an ioctl which reaps them? */ /* * For now, since we don't expect lots of failures * to occur (to the point of memory shortages), * just punt */ /* be sure we're not wasting cpu on old failures */ if (f->uf_retry != 0) { mutex_enter(&uf_stats.ufst_mutex); uf_stats.ufst_cpu_waste++; mutex_exit(&uf_stats.ufst_mutex); f->uf_retry = 0; } rc = SFRC_SUCCESS; break; default: (void) set_state(f, UF_PANIC); TRIVIA((": failed] ")); break; } TRIVIA(("] ")); return (rc); } static sfrc_t sf_panic( ufs_failure_t *f, ufsa_t a, ufs_failure_states_t s) { sfrc_t rc = SFRC_FAIL; TRIVIA(("[sf_panic, action is %s, prev. state is %s", act_name(a), state_name(f->uf_s))); ASSERT(s & UF_PANIC); switch (a) { case UFA_SET: f->uf_retry = -ufsfx_tune.uft_short_err_period; rc = SFRC_SUCCESS; break; case UFA_FOUND: default: real_panic(f, " "); /* LINTED: warning: logical expression always true: op "||" */ ASSERT(DEBUG); (void) set_state(f, UF_UMOUNT); /* XXX UF_NOTFIX? 
*/ break; } TRIVIA(("] ")); return (rc); } /* * minimum state function */ static sfrc_t sf_minimum( ufs_failure_t *f, ufsa_t a, /* LINTED argument unused in function: ignored */ ufs_failure_states_t ignored) { sfrc_t rc = SFRC_FAIL; TRIVIA(("[sf_minimum, action is %s", act_name(a))); switch (a) { case UFA_SET: f->uf_retry = 0; /* FALLTHROUGH */ case UFA_FOUND: rc = SFRC_SUCCESS; break; default: (void) set_state(f, UF_PANIC); TRIVIA((": failed] ")); break; } TRIVIA(("] ")); return (rc); } static int state_trans_valid(ufs_failure_states_t from, ufs_failure_states_t to) { ufsd_t *s; int valid; HIDEOUS(("[state_trans_valid")); if (from & to) return (1); s = get_state_desc(to); /* * extra test is necessary since we want UF_UNDEF = 0, * (to detect freshly allocated memory) * but can't check for that value with a bit test */ valid = (to & UF_INIT)? from == s->ud_prev: from & s->ud_prev; HIDEOUS((": %svalid] ", valid? "": "in")); return (valid); } static int terminal_state(ufs_failure_states_t state) { ufsd_t *s; HIDEOUS(("[terminal_state")); s = get_state_desc(state); HIDEOUS((": %sterminal] ", s->ud_attr.terminal? "": "not ")); return ((int)s->ud_attr.terminal); } static void alloc_lockfs_comment(ufs_failure_t *f, struct lockfs *lfp) { MINUTE(("[alloc_lockfs_comment")); ASSERT(MUTEX_HELD(&f->uf_mutex)); /* * ufs_fiolfs expects a kmem_alloc'ed comment; * it frees the comment if the lock fails * or else when the lock is unlocked. 
*/ f->uf_lf.lf_comment = kmem_zalloc(LOCKFS_MAXCOMMENTLEN, KM_NOSLEEP); if (f->uf_lf.lf_comment) { char *from; size_t len; /* * use panic string if there's no previous comment * or if we're setting the error lock */ if ((LOCKFS_IS_ELOCK(&f->uf_lf) || !lfp->lf_comment || lfp->lf_comlen <= 0)) { from = f->uf_panic_str; len = LOCKFS_MAXCOMMENTLEN; } else { from = lfp->lf_comment; len = lfp->lf_comlen; } bcopy(from, f->uf_lf.lf_comment, len); f->uf_lf.lf_comlen = len; } else { f->uf_lf.lf_comlen = 0; } MINUTE(("] ")); } static int set_lockfs(ufs_failure_t *f, struct lockfs *lfp) { int (*handle_lockfs_rc)(ufs_failure_t *); int rc; MINUTE(("[set_lockfs")); ASSERT(MUTEX_HELD(&f->uf_mutex)); ASSERT(!vfs_lock_held(f->uf_vfsp)); ASSERT(MUTEX_NOT_HELD(f->uf_vfs_lockp)); if (!f->uf_ufsvfsp) { MINUTE((": ufsvfsp is NULL]\n")); return (0); } ASSERT(MUTEX_NOT_HELD(&f->uf_ufsvfsp->vfs_ulockfs.ul_lock)); if (!f->uf_ufsvfsp->vfs_root) { MINUTE((": vfs_root is NULL]\n")); return (0); } alloc_lockfs_comment(f, lfp); f->uf_lf_err = 0; if (!LOCKFS_IS_ELOCK(lfp)) { lfp->lf_lock = f->uf_lf.lf_lock = LOCKFS_ELOCK; VN_HOLD(f->uf_ufsvfsp->vfs_root); f->uf_lf_err = ufs__fiolfs(f->uf_ufsvfsp->vfs_root, &f->uf_lf, /* from_user */ 0, /* from_log */ 0); VN_RELE(f->uf_ufsvfsp->vfs_root); } handle_lockfs_rc = f->uf_lf_err != 0? lockfs_failure: lockfs_success; rc = handle_lockfs_rc(f); MINUTE(("] ")); return (rc); } static int lockfs_failure(ufs_failure_t *f) { int error; ufs_failure_states_t s; TRIVIA(("[lockfs_failure")); ASSERT(MUTEX_HELD(&f->uf_mutex)); if (!f->uf_ufsvfsp) { TRIVIA((": ufsvfsp is NULL]\n")); return (0); } error = f->uf_lf_err; switch (error) { /* non-transient errors: */ case EACCES: /* disk/in-core metadata reconciliation failed */ case EPERM: /* inode reconciliation failed; incore inode changed? 
*/ case EIO: /* device is hard-locked or not responding */ case EROFS: /* device is write-locked */ case EDEADLK: /* can't lockfs; deadlock would result; */ /* Swapping or saving accounting records */ /* onto this fs can cause this errno. */ MINOR(("ufs_fiolfs(\"%s\") of %s failed: %s (%d)", fs_name(f), lock_name(&f->uf_lf), err_name(error), error)); /* * if can't get lock, then fallback to panic, unless * unless unmount was requested (although unmount will * probably fail if the lock failed, so we'll panic * anyway */ s = ((f->uf_flags & UFSFX_LCKUMOUNT) && error != EDEADLK)? UF_UMOUNT: UF_PANIC; if (!set_state(f, s)) { real_panic(f, " "); /*NOTREACHED*/ break; } break; case EBUSY: case EAGAIN: f->uf_retry = ufsfx_tune.uft_short_err_period; if (curthread->t_flag & T_DONTPEND) { curthread->t_flag &= ~T_DONTPEND; } else if (!(f->uf_s & (UF_LOCKED | UF_FIXING))) { ufs_failure_states_t state; /* * if we didn't know that the fix had started, * take note */ state = error == EBUSY? UF_LOCKED: UF_FIXING; if (!set_state(f, state)) { TRIVIA((": failed] ")); return (0); } } break; default: /* some other non-fatal error */ MINOR(("lockfs(\"%s\") of %s returned %s (%d)", lock_name(&f->uf_lf), fs_name(f), err_name(f->uf_lf_err), f->uf_lf_err)); f->uf_retry = ufsfx_tune.uft_short_err_period; break; case EINVAL: /* unmounted? */ (void) set_state(f, UF_NOTFIX); break; } TRIVIA(("] ")); return (1); } static int lockfs_success(ufs_failure_t *f) { TRIVIA(("[lockfs_success")); ASSERT(MUTEX_HELD(&f->uf_mutex)); if (!f->uf_ufsvfsp) { TRIVIA((": ufsvfsp is NULL]\n")); return (0); } switch (f->uf_lf.lf_lock) { case LOCKFS_ELOCK: /* error lock worked */ if (!set_state(f, UF_LOCKED)) { TRIVIA((": failed] ")); return (0); } break; case LOCKFS_ULOCK: /* unlock worked */ /* * how'd we get here? * This should be done from fsck's unlock, * not from this thread's context. 
*/ cmn_err(CE_WARN, "Unlocked error-lock of %s", fs_name(f)); ufsfx_unlockfs(f->uf_ufsvfsp); break; default: if (!set_state(f, UF_NOTFIX)) { TRIVIA((": failed] ")); return (0); } break; } TRIVIA(("] ")); return (1); } /* * when fsck is running it puts its pid into the lockfs * comment structure, prefaced by PIDSTR */ const char *PIDSTR = "[pid:"; static int fsck_active(ufs_failure_t *f) { char *cp; int i, found, errlocked; size_t comlen; const int PIDSTRLEN = (int)strlen(PIDSTR); struct ulockfs *ulp = &f->uf_ufsvfsp->vfs_ulockfs; TRIVIA(("[fsck_active")); ASSERT(f); ASSERT(f->uf_s & UF_FIXING); ASSERT(MUTEX_HELD(&f->uf_mutex)); ASSERT(f->uf_ufsvfsp); ASSERT(MUTEX_NOT_HELD(f->uf_vfs_lockp)); ASSERT(MUTEX_NOT_HELD(&ulp->ul_lock)); mutex_enter(&ulp->ul_lock); cp = ulp->ul_lockfs.lf_comment; comlen = ulp->ul_lockfs.lf_comlen; errlocked = (int)ULOCKFS_IS_ELOCK(ulp); mutex_exit(&ulp->ul_lock); if (!cp || comlen == 0) { TRIVIA((": null comment or comlen <= 0, found:0]")); return (0); } for (found = i = 0; !found && i < (comlen - PIDSTRLEN); i++, cp++) found = strncmp(cp, PIDSTR, PIDSTRLEN) == 0; TRIVIA(("found:%d, is_elock:%d]", found, errlocked)); return (errlocked & found); } static const char unknown_fs[] = ""; static const char null_failure[] = ""; static const char mutated_vfs_bufp[] = ""; static const char mutated_vfs_fs[] = ""; static char * fs_name(ufs_failure_t *f) { HIDEOUS(("[fs_name")); ASSERT(MUTEX_HELD(&f->uf_mutex)); if (!f) { HIDEOUS((": failure ptr is NULL]\n")); return ((char *)null_failure); } if (f->uf_fsname[0] != '\0') { HIDEOUS((": return (uf_fsname)]\n")); return (f->uf_fsname); } if (MUTEX_HELD(f->uf_vfs_lockp)) { if (f->uf_bp != f->uf_ufsvfsp->vfs_bufp) { HIDEOUS((": vfs_bufp mutated from 0x%p to 0x%p\n", (void *)f->uf_bp, (void *)f->uf_ufsvfsp->vfs_bufp)); return ((char *)mutated_vfs_bufp); } if (f->uf_fs != f->uf_ufsvfsp->vfs_fs) { HIDEOUS((": vfs_bufp mutated from 0x%p to 0x%p\n", (void *)f->uf_fs, (void *)f->uf_ufsvfsp->vfs_fs)); return 
((char *)mutated_vfs_fs); } if (f->uf_ufsvfsp && f->uf_bp && f->uf_fs && *f->uf_fs->fs_fsmnt != '\0') { HIDEOUS((": return (fs_fsmnt)]\n")); return (f->uf_fs->fs_fsmnt); } } HIDEOUS((": unknown file system]\n")); return ((char *)unknown_fs); } #if defined(DEBUG) static char * lock_name(struct lockfs *lfp) { struct lock_description *l; char *lname; HIDEOUS(("[lock_name")); lname = lock_desc[0].ld_name; for (l = &lock_desc[1]; l->ld_name != NULL; l++) { if (lfp && lfp->lf_lock == l->ld_type) { lname = l->ld_name; break; } } HIDEOUS(("]")); return (lname); } static char * state_name(ufs_failure_states_t state) { ufsd_t *s; HIDEOUS(("[state_name")); s = get_state_desc(state); HIDEOUS(("]")); return (s->ud_name); } static char * err_name(int error) { struct error_description *e; HIDEOUS(("[err_name")); for (e = &err_desc[1]; e->ed_name != NULL; e++) { if (error == e->ed_errno) { HIDEOUS(("]")); return (e->ed_name); } } HIDEOUS(("]")); return (err_desc[0].ed_name); } static char * act_name(ufsa_t action) { struct action_description *a; HIDEOUS(("[act_name")); for (a = &act_desc[1]; a->ad_name != NULL; a++) { if (action == a->ad_v) { HIDEOUS(("]")); return (a->ad_name); } } HIDEOUS(("]")); return (act_desc[0].ad_name); } /* * dump failure list */ static void dump_uf_list(char *msg) { ufs_failure_t *f; int i; int list_was_locked = MUTEX_HELD(&ufs_fix.uq_mutex); if (!list_was_locked && !mutex_tryenter(&ufs_fix.uq_mutex)) { printf("dump_uf_list: couldn't get list lock\n"); return; } if (msg) { printf("\n%s", msg); } printf("\ndump_uf_list:\n\tuq_lowat: %d, uq_ne: %d\n", ufs_fix.uq_lowat, ufs_fix.uq_ne); mutex_enter(&uf_stats.ufst_mutex); printf("\tuf_stats.current_races: %ld\n", uf_stats.ufst_current_races); printf("\tuf_stats.num_failed: %ld\n", uf_stats.ufst_num_failed); printf("\tuf_stats.num_fixed: %ld\n", uf_stats.ufst_num_fixed); printf("\tuf_stats.cpu_waste: %ld\n", uf_stats.ufst_cpu_waste); printf("\tuf_stats.lock_violations: %ld, unmount_failures: %ld\n", 
uf_stats.ufst_lock_violations, uf_stats.ufst_unmount_failures); mutex_exit(&uf_stats.ufst_mutex); for (f = ufs_fix.uq_ufhead, i = 1; f; f = f->uf_next, i++) { if (!mutex_tryenter(&f->uf_mutex)) { printf("%d.\t\"skipped - try enter failed\"\n", i); continue; } dump_uf(f, i); mutex_exit(&f->uf_mutex); } printf("\n"); if (!list_was_locked) mutex_exit(&ufs_fix.uq_mutex); } static void dump_uf(ufs_failure_t *f, int i) { if (!f) { printf("dump_uf: NULL failure record\n"); return; } printf("%d.\t\"%s\" is %s.\n", i, fs_name(f), state_name(f->uf_s)); printf("\t\"%s\"\tAddr: 0x%p\n", f->uf_panic_str, (void *)f); printf("\tNext: 0x%p\t\tPrev: 0x%p\n", (void *)f->uf_next, (void *)f->uf_prev); if (f->uf_orig) printf("\tOriginal failure: 0x%p \"%s\"\n", (void *)f->uf_orig, f->uf_orig->uf_panic_str); printf("\tUfsvfs: 0x%p\t\tVfs_lockp: 0x%p\n", (void *)f->uf_ufsvfsp, (void *)f->uf_vfs_lockp); printf("\tVfs_fsfxp: 0x%p\n", (void *)f->uf_vfs_ufsfxp); printf("\tVfs_bufp: 0x%p", (void *)f->uf_bp); if (f->uf_bp) printf("\t\tVfs_fs: 0x%p\n", (void *)f->uf_fs); else printf("\n"); printf("\tBegin: 0x%lx\tEntered: 0x%lx\tEnd: 0x%lx\n", f->uf_begin_tm, f->uf_entered_tm, f->uf_end_tm); printf("\tFlags: (%d) %s%s%s%s", f->uf_flags, f->uf_flags & UFSFX_LCKONLY? "\"lock only\" " : "", f->uf_flags & UFSFX_LCKUMOUNT? "\"lock+unmount\" " : "", f->uf_flags & UFSFX_REPAIR_START? "\"started repair\" " : "", f->uf_flags == 0? 
"" : ""); printf("\tRetry: %ld seconds\n", f->uf_retry); printf("\tLockfs:\ttype: %s\terror: %s (%d)\n", lock_name(&f->uf_lf), err_name(f->uf_lf_err), f->uf_lf_err); } #endif /* DEBUG */ /* * returns # of ufs_failures in a non-terminal state on queue * used to coordinate with hlock thread (see ufs_thread.c) * and to determine when the error lock thread may exit */ int ufsfx_get_failure_qlen(void) { ufs_failure_t *f; ufsd_t *s; int qlen = 0; MINUTE(("[ufsfx_get_failure_qlen")); if (!mutex_tryenter(&ufs_fix.uq_mutex)) return (-1); /* * walk down failure list */ for (f = ufs_fix.uq_ufhead; f; f = f->uf_next) { if (!mutex_tryenter(&f->uf_mutex)) continue; s = get_state_desc(f->uf_s); if (s->ud_attr.terminal) { mutex_exit(&f->uf_mutex); continue; } MINUTE((": found: %s, \"%s: %s\"\n", fs_name(f), state_name(f->uf_s), f->uf_panic_str)); qlen++; mutex_exit(&f->uf_mutex); } mutex_exit(&ufs_fix.uq_mutex); MINUTE((": qlen=%d]\n", qlen)); return (qlen); } /* * timeout routine * called to shutdown fix failure thread and server daemon */ static void ufsfx_kill_fix_failure_thread(void *arg) { clock_t odelta = (clock_t)arg; int qlen; MAJOR(("[ufsfx_kill_fix_failure_thread")); qlen = ufsfx_get_failure_qlen(); if (qlen < 0) { clock_t delta; delta = odelta << 1; if (delta <= 0) delta = INT_MAX; (void) timeout(ufsfx_kill_fix_failure_thread, (void *)delta, delta); MAJOR((": rescheduled")); } else if (qlen == 0) { ufs_thread_exit(&ufs_fix); MAJOR((": killed")); } /* * else * let timeout expire */ MAJOR(("]\n")); }