Lines Matching full:recovery
5 * recovery stuff
127 /* Worker function used during recovery. */
164 * RECOVERY THREAD
169 /* wake the recovery thread in dlm_kick_recovery_thread()
171 * 1) sleeping with no recovery happening in dlm_kick_recovery_thread()
172 * 2) sleeping with recovery mastered elsewhere in dlm_kick_recovery_thread()
173 * 3) recovery mastered here, waiting on reco data */ in dlm_kick_recovery_thread()
178 /* Launch the recovery thread */
181 mlog(0, "starting dlm recovery thread...\n"); in dlm_launch_recovery_thread()
197 mlog(0, "waiting for dlm recovery thread to exit\n"); in dlm_complete_recovery_thread()
206 * this is lame, but here's how recovery works...
207 * 1) all recovery threads cluster wide will work on recovering
213 * 4) each of these locks should be locked until recovery is done
223 * everything and recovery for this dead node is done
233 mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", in dlm_print_reco_node_status()
301 mlog(0, "quitting DLM recovery thread\n"); in dlm_recovery_thread()
305 /* returns true when the recovery master has contacted us */
360 printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " in dlm_wait_for_node_recovery()
373 * block on the dlm->reco.event when recovery is in progress.
374 * the dlm recovery thread will set this state when it begins
377 * been marked with the RECOVERY flag */
391 mlog(0, "%s: reco thread %d in recovery: " in dlm_wait_for_recovery()
404 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", in dlm_begin_recovery()
415 printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); in dlm_end_recovery()
421 printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " in dlm_print_recovery_master()
435 mlog(0, "%s: no need do recovery after migrating all " in dlm_do_recovery()
461 mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", in dlm_do_recovery()
472 mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", in dlm_do_recovery()
489 * or recovery completes entirely. */ in dlm_do_recovery()
495 mlog(0, "another node will master this recovery session.\n"); in dlm_do_recovery()
502 * have been marked as in-recovery */ in dlm_do_recovery()
520 /* success! see if any other nodes need recovery */ in dlm_do_recovery()
521 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", in dlm_do_recovery()
543 /* we have become recovery master. there is no escaping in dlm_remaster_locks()
547 mlog(ML_ERROR, "%s: failed to alloc recovery area, " in dlm_remaster_locks()
573 /* node died, ignore it for recovery */ in dlm_remaster_locks()
589 "%d during recovery, retrying " in dlm_remaster_locks()
607 "recovery info for node %u\n", in dlm_remaster_locks()
614 mlog(0, "now receiving recovery data from " in dlm_remaster_locks()
619 mlog(0, "already receiving recovery data from " in dlm_remaster_locks()
624 mlog(0, "already DONE receiving recovery data " in dlm_remaster_locks()
643 mlog(0, "checking recovery state of node %u\n", in dlm_remaster_locks()
655 "requesting recovery info for " in dlm_remaster_locks()
684 /* Set this flag on recovery master to avoid in dlm_remaster_locks()
685 * a new recovery for another dead node starting in dlm_remaster_locks()
686 * before the current recovery is done. That could in dlm_remaster_locks()
687 * cause recovery to hang. */ in dlm_remaster_locks()
704 mlog(0, "should be done with recovery!\n"); in dlm_remaster_locks()
706 mlog(0, "finishing recovery of %s at %lu, " in dlm_remaster_locks()
878 mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", in dlm_request_all_locks_worker()
883 /* worker could have been created before the recovery master in dlm_request_all_locks_worker()
886 mlog(ML_NOTICE, "%s: will not send recovery state, " in dlm_request_all_locks_worker()
887 "recovery master %u died, thread=(dead=%u,mas=%u)" in dlm_request_all_locks_worker()
904 * can safely move UNKNOWN lock resources for each recovery in dlm_request_all_locks_worker()
917 "recovery state for dead node %u, ret=%d\n", dlm->name, in dlm_request_all_locks_worker()
933 "recovery all-done for dead node %u, ret=%d\n", in dlm_request_all_locks_worker()
1005 * the line of recovery */ in dlm_reco_data_done_handler()
1011 "recovery data!\n", in dlm_reco_data_done_handler()
1021 /* wake the recovery thread, some node is done */ in dlm_reco_data_done_handler()
1026 mlog(ML_ERROR, "failed to find recovery node data for node " in dlm_reco_data_done_handler()
1043 /* always prune any $RECOVERY entries for dead nodes, in dlm_move_reco_locks_to_list()
1044 * otherwise hangs can occur during later recovery */ in dlm_move_reco_locks_to_list()
1051 "a $RECOVERY lock for dead " in dlm_move_reco_locks_to_list()
1068 "doing recovery for node %u. sending it.\n", in dlm_move_reco_locks_to_list()
1072 mlog(0, "found UNKNOWN owner while doing recovery " in dlm_move_reco_locks_to_list()
1119 orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", in dlm_send_mig_lockres_msg()
1133 "migration" : "recovery")); in dlm_send_mig_lockres_msg()
1308 send_to, flags & DLM_MRES_RECOVERY ? "recovery" : in dlm_send_one_lockres()
1325 flags & DLM_MRES_RECOVERY ? "recovery" : "migration", in dlm_send_one_lockres()
1334 * recovery data, and it will work on only one lockres.
1341 * we really cannot afford to fail an alloc in recovery
1381 "recovery" : "migration", mres->master); in dlm_mig_lockres_handler()
1393 * and RECOVERY flag changed when it completes. */ in dlm_mig_lockres_handler()
1490 * or when a lock is added by the recovery worker */ in dlm_mig_lockres_handler()
1495 mlog(0, "recovery has passed me a lockres with an " in dlm_mig_lockres_handler()
1974 * wrt lock queue ordering and recovery: in dlm_process_recovery_data()
2056 "Recovering res %s:%.*s, is already on recovery list!\n", in dlm_move_lockres_to_recovery_list()
2061 /* We need to hold a reference while on the recovery list */ in dlm_move_lockres_to_recovery_list()
2099 * recovery master. */ in dlm_move_lockres_to_recovery_list()
2125 /* removes all recovered locks from the recovery list.
2127 * unsets the RECOVERY flag and wakes waiters. */
2145 * the lock state sent during recovery */ in dlm_finish_local_lockres_recovery()
2183 * the lock state sent during recovery */ in dlm_finish_local_lockres_recovery()
2342 * dead node. once recovery finishes, the dlm thread in dlm_do_local_recovery_cleanup()
2349 /* always prune any $RECOVERY entries for dead nodes, in dlm_do_local_recovery_cleanup()
2350 * otherwise hangs can occur during later recovery */ in dlm_do_local_recovery_cleanup()
2357 "a $RECOVERY lock for dead " in dlm_do_local_recovery_cleanup()
2425 mlog(0, "%s: recovery master %d just died\n", in __dlm_hb_node_down()
2429 * the new_master and dead_node. that recovery in __dlm_hb_node_down()
2447 "another node likely did recovery already.\n", in __dlm_hb_node_down()
2519 mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", in dlm_reco_ast()
2525 mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", in dlm_reco_bast()
2530 mlog(0, "unlockast for recovery lock fired!\n"); in dlm_reco_unlock_ast()
2535 * dlmlock() on the special "$RECOVERY" lockres with the
2537 * this function on each node racing to become the recovery
2539 * a) this node gets the EX (and becomes the recovery master),
2542 * so each time a recovery master is needed, the entire cluster
2551 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", in dlm_pick_recovery_master()
2560 mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", in dlm_pick_recovery_master()
2571 "do the recovery\n", dlm->name, in dlm_pick_recovery_master()
2577 /* see if recovery was already finished elsewhere */ in dlm_pick_recovery_master()
2593 /* if this node has actually become the recovery master, in dlm_pick_recovery_master()
2594 * set the master and send the messages to begin recovery */ in dlm_pick_recovery_master()
2610 /* recovery lock is a special case. ast will not get fired, in dlm_pick_recovery_master()
2652 mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " in dlm_pick_recovery_master()
2661 mlog(ML_ERROR, "recovery lock not found\n"); in dlm_pick_recovery_master()
2709 /* node is down. not involved in recovery in dlm_send_begin_reco_message()
2722 mlog(0, "%s: trying to start recovery of node " in dlm_send_begin_reco_message()
2723 "%u, but node %u is waiting for last recovery " in dlm_send_begin_reco_message()
2743 mlog(ML_ERROR, "recovery lock not found\n"); in dlm_send_begin_reco_message()
2804 mlog(0, "recovery master %u sees %u as dead, but this " in dlm_begin_reco_handler()
2812 /* force the recovery cleanup in __dlm_hb_node_down in dlm_begin_reco_handler()
2822 mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", in dlm_begin_reco_handler()
2840 mlog(0, "finishing recovery for node %s:%u, " in dlm_send_finalize_reco_message()
2866 /* this has no effect on this recovery in dlm_send_finalize_reco_message()
2868 * finish out the last recovery */ in dlm_send_finalize_reco_message()
2870 "node finished recovery.\n", nodenum); in dlm_send_finalize_reco_message()
2901 mlog(0, "%s: node %u finalizing recovery stage%d of " in dlm_finalize_reco_handler()
2908 mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " in dlm_finalize_reco_handler()
2914 mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " in dlm_finalize_reco_handler()
2950 mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", in dlm_finalize_reco_handler()