xref: /titanic_52/usr/src/cmd/avs/sdbc/etc/dscfg_reconfigure.cluster.sh (revision 9c9af2590af49bb395bc8d2eace0f2d4ea16d165)
1#!/usr/bin/ksh
2# CDDL HEADER START
3#
4# The contents of this file are subject to the terms of the
5# Common Development and Distribution License (the "License").
6# You may not use this file except in compliance with the License.
7#
8# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9# or http://www.opensolaris.org/os/licensing.
10# See the License for the specific language governing permissions
11# and limitations under the License.
12#
13# When distributing Covered Code, include this CDDL HEADER in each
14# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15# If applicable, add the following below this CDDL HEADER, with the
16# fields enclosed by brackets "[]" replaced with your own identifying
17# information: Portions Copyright [yyyy] [name of copyright owner]
18#
19# CDDL HEADER END
20#
21#
22# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23# Use is subject to license terms.
24#
25# NWS DataServices within SunCluster reconfiguration script.
26#
27# Description:
28#
29# This script is called from /usr/cluster/lib/sc/run_reserve at
30# appropriate times to start and stop the NWS DataServices as SunCluster
31# disk device groups are brought online or taken offline.
32#
# SNDR configuration requires a resource group to be configured.
# 1. The resource group name should be the same as the device group name
#    with -stor-rg appended, e.g. if the device group name is abc-dg then
#    the resource group name would be abc-dg-stor-rg.
# 2. It should have 2 resources in it, unless one of the resource types is the
#    SUNW.GeoCtlAVS. One of type SUNW.LogicalHostname and either SUNW.HAStorage
#    or SUNW.HAStoragePlus types. Resource type versioning is ignored.
#    A HAStorage type resource should have its ServicePaths property set to
#    the device group name. A HAStoragePlus type resource should have either
#    the FilesystemMountPoints property pointing to a file system associated
#    with the device group name, or the GlobalDevicePaths property set to the
#    device group name. The LogicalHostname type resource should have a
#    failover IP address in it, which will be used by SNDR to communicate
#    with the secondary side.
#
# SNDR requires that the LogicalHost (failover) IP address, which is part of
# the resource group for SNDR, be hosted on the same node as the device
# group. Therefore this script tries to move the resource group along with
# the device group in the become_primary case of the run_reserve script. In
# the primary_to_secondary case, it will try to kill the switchover function
# if it is still running in the background, after stopping NWS data services.
53#
54# Usage:
55#
56# /usr/cluster/sbin/dscfg_reconfigure { start | stop } diskgroup
57#
58# Configuration:
59#
60# Scripts to be run should have been symlinked into $NWS_START_DIR and
61# $NWS_STOP_DIR.  Note that the scripts are processed in lexical order,
62# and that unlike /etc/rc?.d/ there is no leading S or K character.
63#
64# Exit status:
65#
66# 0 - success
67# 1 - error
68#
69
70#
71# Global variables
72#
73
# Name of this program (basename of the invoked path); used in the syslog
# tags and in the usage message.  "$0" is quoted so that a path containing
# whitespace is handed to basename as a single argument.
typeset -r ARGV0=$(basename "$0")

# directory full of start scripts
typeset -r NWS_START_DIR=/usr/cluster/lib/dscfg/start

# directory full of stop scripts
typeset -r NWS_STOP_DIR=/usr/cluster/lib/dscfg/stop

# the syslog facility to use.
# - conceptually this should be based on the output of
#   "scha_cluster_get -O SYSLOG_FACILITY", but that won't work early
#   during boot.
typeset -r SYSLOG_FACILITY=daemon

# Append the cluster command directory and /etc to the search path.
PATH=$PATH:/usr/cluster/bin:/etc

# Variables for retrying scswitch of Resource group for SNDR
retry_num=12		# number of retries made by switchfunc
retry_interval=10	# seconds slept between retries
rgname=			# resource group name, set by do_scswitch
rgstat=			# resource group state, set by get_rgstat
skip_resource=0
count_LogicalHostname=0	# enabled SUNW.LogicalHostname resources seen
count_HAStoragePlus=0	# enabled SUNW.HAStoragePlus resources seen
99
# Since the switchover of the resource group is called in background,
# the stop action of the reconfig script will kill the background switchover
# if it is running. Since we are stopping the NWS services on the node, there
# is no need to switch the resource group, so it is killed.
# The pid of the process is kept in file /var/run/scnws/$dg.pid.
# Input:  dg - device group
# Output: Nothing, kills the process(es) listed in the pid file and removes
#         the file. Does nothing when the pid file does not exist.

function kill_scswitch
{
	typeset dg=$1
	typeset pidfile=/var/run/scnws/$dg.pid
	typeset pid

	if [ -f "$pidfile" ]
	then
		for pid in $(cat "$pidfile")
		do
			# Only signal plain, non-zero process ids: an entry
			# such as "0" or "-1" would make kill address whole
			# process groups instead of a single process.
			case "$pid" in
			''|0|*[!0-9]*)
				continue
				;;
			esac
			kill -9 "$pid"
		done
		rm -f "$pidfile"
	fi
}
121
# Get the status of the resource group on this node, using scha commands.
# Input: resource group - $1
# Output: the state printed by scha_resourcegroup_get is stored in the
#         global variable rgstat; the return status is that of the
#         scha_resourcegroup_get command.

function get_rgstat
{
	typeset rg=$1

	rgstat=$(scha_resourcegroup_get -O RG_STATE -G "$rg")
}
131
# This function is called in background from do_scswitch function, to
# switch the resource group to this node, which is becoming primary for
# the diskgroup. If the status of resource group is Offline, it will use
# scswitch command to switch the resource group to this node. If it has
# become Online, cleanup pid file. If it is Pending, the resource group
# is in the state of becoming online, so wait for some time to become Online.
# scswitch may fail, so the function retries $retry_num times, waiting for
# $retry_interval seconds. Any other state is logged and treated as a failure.
# The pid file /var/run/scnws/$dg.pid is removed on every return path.
# Input: resource group - $1, Diskgroup/Diskset - $2
# Output: 0 - success, 1 - failure

function switchfunc
{
	typeset rg=$1
	typeset dg=$2
	typeset pidfile=/var/run/scnws/$dg.pid
	typeset how_many=0
	typeset retval

	# NOTE(review): presumably this delay lets the caller record our pid
	# in the pid file before we may remove that file - confirm.
	sleep 2
	while [ "$how_many" != "$retry_num" ]
	do
		# get_rgstat stores the state in the global rgstat
		get_rgstat "$rg"
		case "$rgstat" in
		"ONLINE")
			rm -f "$pidfile"
			return 0
			;;

		"OFFLINE")
			logger -p ${SYSLOG_FACILITY}.notice \
			    -t "NWS.[$ARGV0]" \
			    "$(gettext "scswitch of resource group")" "$rg"

			scswitch -z -g "$rg" -h "$(hostname)"
			retval=$?
			# Only a failed scswitch consumes a retry; after a
			# successful one the state is re-read right away.
			if [ "$retval" -ne 0 ]
			then
				sleep $retry_interval
				how_many=$(($how_many + 1))
			fi
			;;

		"PENDING_ONLINE")
			logger -p ${SYSLOG_FACILITY}.notice \
			    -t "NWS.[$ARGV0]" \
			    "$(gettext "pending online of resource group")" "$rg"
			sleep $retry_interval
			how_many=$(($how_many + 1))
			;;

		*)
			logger -p ${SYSLOG_FACILITY}.notice \
			    -t "NWS.[$ARGV0]" \
			    "$(gettext "Improper resource group status for Remote Mirror")" "$rgstat"
			rm -f "$pidfile"
			return 1
			;;
		esac
	done

	# All retries used up without the resource group coming online.
	logger -p ${SYSLOG_FACILITY}.err \
	    -t "NWS.[$ARGV0]" "Did not switch resource group for Remote Mirror. System Administrator intervention required"
	rm -f "$pidfile"
	return 1
}
191
192
# This function calls switchfunc function in background, to switch the
# resource group for SNDR. It validates the diskgroup/diskset is configured
# for SNDR, checks if the resource group is in Managed state etc.
# If it detects a mis-configuration, it returns 2 so that the caller can
# disable SNDR for the device group being processed. This is to prevent
# cluster hangs and panics.
#
# The resource group is looked up by name: it must be called
# "<device group>-stor-rg". It must contain exactly one enabled
# SUNW.LogicalHostname resource and exactly one enabled SUNW.HAStoragePlus
# resource (resource type versions are ignored).
# NOTE(review): the description at the top of this file also mentions
# SUNW.HAStorage as an alternative, but only SUNW.HAStoragePlus resources
# are counted here.
#
# Switchfunc is called in the background to avoid the deadlock situation arising
# out of switchover of resource group from within device group switchover.
#
# In run_reserve context, we are doing the device group switchover, trying to
# bring it online on the node. Device group is not completely switched online,
# until the calling script run_reserve returns. In the process, we are calling
# the associated SNDR resource group switchover using scswitch command.
# Resource group switchover will trigger the switchover of device group also.
#
# If resource group switchover is called in foreground, before the device
# group has become online, then it will result in switching the device group
# again, resulting in deadlock. Resource group can not become online until
# the device group is online and the device group can not become online until the
# script returns, causing this circular dependency resulting in deadlock.
#
# Calling the resource group switch in background allows current run_reserve
# script to return immediately, allowing device group to become online.
# If the device group is already online on the node, then the resource group
# does not cause the device group switchover again.
#
# Input: Device group dg - $1
# Output: 0 - success
#	  1 - either dg not applicable for SNDR or error
#	  2 - SNDR mis-configuration
# Globals written: rgname, rgstat (through get_rgstat),
#	  count_LogicalHostname, count_HAStoragePlus

function do_scswitch
{
	typeset dg=$1
	typeset rs_list rs rs_type rs_enb pid

	if [ ! -x /usr/cluster/bin/scha_resource_get ] ||
	    [ ! -x /usr/cluster/bin/scha_resourcegroup_get ]
	then
		return 1
	fi

	# hard coded rg name from dg
	rgname="$dg-stor-rg"
	scha_resourcegroup_get -O rg_description -G "$rgname" > /dev/null
	if [ $? != 0 ]
	then
		# There is no device group configured in cluster for SNDR
		# with this cluster tag
		return 1
	fi

	# Check the state of resource group

	get_rgstat "$rgname"
	case "$rgstat" in
	""|"UNMANAGED"|"ERROR_STOP_FAILED")
		logger -p ${SYSLOG_FACILITY}.notice \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Improper Remote Mirror resource group state")" "$rgstat"
		return 2
		;;
	esac

	# Check whether resources are of proper type and they are enabled

	rs_list=$(scha_resourcegroup_get -O resource_list -G "$rgname")
	if [ -z "$rs_list" ]
	then
		logger -p ${SYSLOG_FACILITY}.notice \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "No resources in Remote Mirror resource group <$rgname>")"
		return 2
	fi

	# Start every validation from zero so that a repeated call does not
	# add to the counts of an earlier one.
	count_LogicalHostname=0
	count_HAStoragePlus=0

	# $rs_list is deliberately unquoted: it holds one resource per word.
	for rs in $rs_list
	do
		# keep only the part of the type name before the first ':'
		rs_type=$(scha_resource_get -O type -R "$rs" -G "$rgname" | cut -d':' -f1)
		case "$rs_type" in
		SUNW.LogicalHostname)
			rs_enb=$(scha_resource_get -O ON_OFF_SWITCH -R "$rs" -G "$rgname")
			if [ "$rs_enb" = "ENABLED" ]
			then
				count_LogicalHostname=$(($count_LogicalHostname + 1))
			fi
			;;
		SUNW.HAStoragePlus)
			rs_enb=$(scha_resource_get -O ON_OFF_SWITCH -R "$rs" -G "$rgname")
			if [ "$rs_enb" = "ENABLED" ]
			then
				count_HAStoragePlus=$(($count_HAStoragePlus + 1))
			fi
			;;
		esac
	done

	if [ "$count_LogicalHostname" -lt 1 ]
	then
		logger -p ${SYSLOG_FACILITY}.notice \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Missing Enabled Logical Host in resource group <$rgname> for Remote Mirror")"
		return 2
	elif [ "$count_LogicalHostname" -gt 1 ]
	then
		logger -p ${SYSLOG_FACILITY}.notice \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Too Many Enabled Logical Host in resource group <$rgname> for Remote Mirror")"
		return 2
	fi

	if [ "$count_HAStoragePlus" -lt 1 ]
	then
		logger -p ${SYSLOG_FACILITY}.notice \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Missing Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror")"
		return 2
	elif [ "$count_HAStoragePlus" -gt 1 ]
	then
		logger -p ${SYSLOG_FACILITY}.notice \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Too Many Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror")"
		return 2
	fi

	# Invoke switchfunc to switch the resource group, and record its pid
	# so that kill_scswitch can find it later.

	switchfunc "$rgname" "$dg" &
	pid=$!
	mkdir -p /var/run/scnws/
	rm -f "/var/run/scnws/$dg.pid"
	echo "$pid" > "/var/run/scnws/$dg.pid"

	return 0
}
323
324
325#
326# Functions
327#
328
# Report the expected command line through syslog (priority "err") and
# terminate the script with exit status 1.  Never returns.
usage()
{
	typeset msg="usage: $ARGV0 { start | stop } diskgroup"

	logger -p "${SYSLOG_FACILITY}.err" -t "NWS.[$ARGV0]" "$msg"
	exit 1
}
335
336
# Run every non-empty script found in a directory, in lexical order, as
# "/usr/bin/ksh <script> <action> <device group>", piping the script's
# stdout and stderr through logger into syslog.
#
# Input: arg1) directory of NWS scripts ($NWS_START_DIR or $NWS_STOP_DIR)
#	 arg2) start / stop - passed to each script as its first argument
#	 arg3) device group - passed to each script as its second argument
#	 arg4) sndr_ena / sndr_dis - with sndr_dis the script <dir>/10rdc
#	       is skipped (SNDR misconfigured - prevent start)
# Output: Nothing. Log error if the directory does not exist.

process_dir()
{
	typeset dir=$1
	typeset arg1=$2
	typeset dg=$3
	typeset arg2=$4
	typeset RDC=$dir/10rdc
	typeset f

	if [[ ! -d $dir ]]
	then
		logger -p ${SYSLOG_FACILITY}.err \
		    -t "NWS.[$ARGV0]" "no directory: $dir"
		return
	fi

	# process scripts in the directories in lexical order
	# note - no leading S or K unlike /etc/rc?.d/
	for f in "$dir"/*
	do
		# Skip empty files; this also skips the literal pattern left
		# behind when the directory has no entries at all.
		[ -s "$f" ] || continue

		# SNDR misconfigured - prevent start
		if [ "$arg2" = "sndr_dis" ] && [ "$f" = "$RDC" ]
		then
			continue
		fi

		# run script and pipe output through
		# logger into syslog
		/usr/bin/ksh "$f" "$arg1" "$dg" 2>&1 |
		    logger -p ${SYSLOG_FACILITY}.notice \
			-t "NWS.[${ARGV0}:$(basename "$f")]"
	done
}
383
384
385#
386# main
387#
388
# Exactly two arguments are required: the action and the disk group.
if [ $# -ne 2 ]
then
	usage
	# not reached
fi


case "$1" in
start)
	logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "starting: $ARGV0 $*"
	do_scswitch "$2"
	retval=$?
	# do_scswitch returns 2 when it detects an SNDR mis-configuration
	if [ "$retval" -eq 2 ]
	then
		logger -p ${SYSLOG_FACILITY}.err \
		    -t "NWS.[$ARGV0]" "**FATAL ERROR** Remote Mirror is mis-configured and DISABLED for devicegroup <$2> "
		# Disable SNDR
		process_dir $NWS_START_DIR start "$2" sndr_dis
	else
		process_dir $NWS_START_DIR start "$2" sndr_ena
	fi
	;;
stop)
	logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "stopping: $ARGV0 $*"
	process_dir $NWS_STOP_DIR stop "$2" sndr_ena
	# the background resource group switch is no longer needed
	kill_scswitch "$2"
	;;

*)
	usage
	# not reached
	;;
esac

logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "completed: $ARGV0 $*"

exit 0
426