1#!/usr/bin/ksh 2# CDDL HEADER START 3# 4# The contents of this file are subject to the terms of the 5# Common Development and Distribution License (the "License"). 6# You may not use this file except in compliance with the License. 7# 8# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9# or http://www.opensolaris.org/os/licensing. 10# See the License for the specific language governing permissions 11# and limitations under the License. 12# 13# When distributing Covered Code, include this CDDL HEADER in each 14# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15# If applicable, add the following below this CDDL HEADER, with the 16# fields enclosed by brackets "[]" replaced with your own identifying 17# information: Portions Copyright [yyyy] [name of copyright owner] 18# 19# CDDL HEADER END 20# 21# 22# Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23# Use is subject to license terms. 24# 25# NWS DataServices within SunCluster reconfiguration script. 26# 27# Description: 28# 29# This script is called from /usr/cluster/lib/sc/run_reserve at 30# appropriate times to start and stop the NWS DataServices as SunCluster 31# disk device groups are brought online or taken offline. 32# 33# SNDR configuration requires that a resource group to be configured. 34# 1. The resource group name should be same as device group name with -stor-rg 35# added. e.g. if device group name is abc-dg then resource group name 36# would be abc-dg-stor-rg. 37# 2. It should have 2 resources in it, unless one of the resource types is the 38# SUNW.GeoCtlAVS. One of type SUNW.LogicalHostname and either SUNW.HAStorage 39# or SUNW.HAStoragePlus types. Resource type versioning is ignored. 40# HAStorage type resource, should have ServicePaths property set to 41# device group name. HAStoragePlus type resource, should have either the 42# FilesystemMountPoints pointing to a files system associated with the 43# device group name, or GlobalDevicePaths property set to device group name. 44# LogicalHostname type resource should have a failoverIP address in it and 45# it will be used by SNDR to communicate with the secondary side. 46# 47# As SNDR requires that the LogicalHost (failover) IP address which is a 48# part of resource group for SNDR, to be hosted on the same node where the 49# device group is, it tries to move the resource group also alongwith the 50# device group, in become_primary case of run_reserve script. While 51# in primary_to_secondary case, it will try to kill the switchover function 52# if it is still running in background, after stopping NWS data services. 53# 54# Usage: 55# 56# /usr/cluster/sbin/dscfg_reconfigure { start | stop } diskgroup 57# 58# Configuration: 59# 60# Scripts to be run should have been symlinked into $NWS_START_DIR and 61# $NWS_STOP_DIR. Note that the scripts are processed in lexical order, 62# and that unlike /etc/rc?.d/ there is no leading S or K character. 63# 64# Exit status: 65# 66# 0 - success 67# 1 - error 68# 69 70# 71# Global variables 72# 73 74# this program 75typeset -r ARGV0=$(basename $0) 76 77# directory full of start scripts 78typeset -r NWS_START_DIR=/usr/cluster/lib/dscfg/start 79 80# directory full of stop scripts 81typeset -r NWS_STOP_DIR=/usr/cluster/lib/dscfg/stop 82 83# the syslog facility to use. 84# - conceptually this should be based on the output of 85# "scha_cluster_get -O SYSLOG_FACILITY", but that won't work early 86# during boot. 87typeset -r SYSLOG_FACILITY=daemon 88 89PATH=$PATH:/usr/cluster/bin:/etc 90 91# Variables for retrying scswitch of Resource group for SNDR 92retry_num=12 93retry_interval=10 94rgname= 95rgstat= 96skip_resource=0 97count_LogicalHostname=0 98count_HAStoragePlus=0 99 100# Since the switchover of the resource group is called in background, 101# the stop action of the reconfig script will kill the background switchover 102# if it is running. Since we are stopping the NWS services on the node, there 103# is no need to switch the resource group, so it is killed. 104# The pid of the process is kept in file /var/run/scnws/$dg.pid. 105# Input: dg - device group 106# Output: Nothing, kills the process 107 108function kill_scswitch 109{ 110 dg=$1 111 if [ -f /var/run/scnws/$dg.pid ] 112 then 113 for i in `cat /var/run/scnws/$dg.pid` 114 do 115 pid=$i 116 kill -9 $pid 117 done 118 rm -f /var/run/scnws/$dg.pid 119 fi 120} 121 122# Get the status of the resource group on this node, using scha commands. 123# Input: resource group - $1 124# Output: Status 125 126function get_rgstat 127{ 128 rg=$1 129 rgstat=`scha_resourcegroup_get -O RG_STATE -G $rg` 130} 131 132# This function is called in background from do_scswitch function, to 133# switch the resource group to this node, which is becoming primary for 134# the diskgroup. If the status of resource group is Offline, it will use 135# scswitch command to switch the resource group to this node. If it has 136# become Online, cleanup pid file. If it is Pending, the resource group 137# is in the state of becoming online, so wait for sometime to become Online.. 138# scswitch may fail, so the function retries $retry_num times, waiting for 139# $retry_interval seconds. 140# Input: resource group - $1, Diskgroup/Diskset - $2 141# Output: 0 - success, 1 - failure 142 143function switchfunc 144{ 145 rg=$1 146 dg=$2 147 how_many=0 148 sleep 2 149 while [ $how_many != $retry_num ] 150 do 151 get_rgstat $rg 152 case "$rgstat" in 153 "ONLINE") 154 rm -f /var/run/scnws/$dg.pid 155 return 0 156 ;; 157 158 "OFFLINE") 159 logger -p ${SYSLOG_FACILITY}.notice \ 160 -t "NWS.[$ARGV0]" `gettext "scswitch of resource group"` "$rg" 161 162 scswitch -z -g $rg -h $(hostname) 163 retval=$? 164 if [ $retval != 0 ] 165 then 166 sleep $retry_interval 167 how_many=$(($how_many + 1)) 168 fi 169 ;; 170 171 "PENDING_ONLINE") 172 logger -p ${SYSLOG_FACILITY}.notice \ 173 -t "NWS.[$ARGV0]" `gettext "pending online of resource group"` "$rg" 174 sleep $retry_interval 175 how_many=$(($how_many + 1)) 176 ;; 177 178 *) 179 logger -p ${SYSLOG_FACILITY}.notice \ 180 -t "NWS.[$ARGV0]" `gettext "Improper resource group status for Remote Mirror"` "$rgstat" 181 rm -f /var/run/scnws/$dg.pid 182 return 1 183 ;; 184 esac 185 done 186 logger -p ${SYSLOG_FACILITY}.err \ 187 -t "NWS.[$ARGV0]" "Did not switch resource group for Remote Mirror. System Administrator intervention required" 188 rm -f /var/run/scnws/$dg.pid 189 return 1 190} 191 192 193# This function calls switchfunc function in background, to switch the 194# resource group for SNDR. It validates the diskgroup/diskset is configured 195# for SNDR, checks if the resource group is in Managed state etc. 196# If it detects a mis-configuration, it will disable SNDR for the 197# device group being processed. This is to prevent cluster hangs and panics. 198# 199# The ServicePaths extension property of HAStorage type resource or the 200# GlobalDevicePaths extension property of HAStoragePlus, both of which 201# specify the device group, serve as a link or mapping to retrieve the 202# resource group associated with the SNDR configured device group. 203# Switchfunc is called in the background to avoid the deadlock situation arising 204# out of switchover of resource group from within device group switchover. 205# 206# In run_reserve context, we are doing the device group switchover, trying to 207# bring it online on the node. Device group is not completely switched online, 208# until the calling script run_reserve returns. In the process, we are calling 209# the associated SNDR resource group switchover using scswitch command. 210# Resource group switchover will trigger the switchover of device group also. 211# 212# If resource group switchover is called in foreground, before the device 213# group has become online, then it will result in switching the device group 214# again, resulting in deadlock. Resource group can not become online until 215# the device group is online and the device group can not become online until the 216# script returns, causing this circular dependency resulting in deadlock. 217# 218# Calling the resource group switch in background allows current run_reserve 219# script to return immediately, allowing device group to become online. 220# If the device group is already online on the node, then the resource group 221# does not cause the device group switchover again. 222# 223# Input: Device group dg - $1 224# Output: 0 - success 225# 1 - either dg not applicable for SNDR or error 226# 2 - SNDR mis-configuration 227 228function do_scswitch 229{ 230 dg=$1 231 232 if [ ! -x /usr/cluster/bin/scha_resource_get \ 233 -o ! -x /usr/cluster/bin/scha_resourcegroup_get ] 234 then 235 return 1 236 fi 237 238# hard coded rg name from dg 239 rgname="$dg-stor-rg" 240 scha_resourcegroup_get -O rg_description -G $rgname > /dev/null 241 if [ $? != 0 ] 242 then 243# There is no device group configured in cluster for SNDR with this cluster tag 244 return 1 245 fi 246 247# Check the state of resource group 248 249 get_rgstat $rgname 250 if [ -z "$rgstat" \ 251 -o "$rgstat" = "UNMANAGED" -o "$rgstat" = "ERROR_STOP_FAILED" ] 252 then 253 logger -p ${SYSLOG_FACILITY}.notice \ 254 -t "NWS.[$ARGV0]" \ 255 `gettext "Improper Remote Mirror resource group state"` "$rgstat" 256 return 2 257 fi 258 259# Check whether resources are of proper type and they are enabled 260 261 rs_list=`scha_resourcegroup_get -O resource_list -G $rgname` 262 if [ -z "$rs_list" ] 263 then 264 logger -p ${SYSLOG_FACILITY}.notice \ 265 -t "NWS.[$ARGV0]" \ 266 `gettext "No resources in Remote Mirror resource group <$rgname>"` 267 return 2 268 fi 269 for rs in $rs_list 270 do 271 rs_type=`scha_resource_get -O type -R $rs -G $rgname | cut -d':' -f1` 272 case "$rs_type" in 273 SUNW.LogicalHostname) 274 rs_enb=`scha_resource_get -O ON_OFF_SWITCH -R $rs -G $rgname` 275 if [ "$rs_enb" = "ENABLED" ] 276 then 277 count_LogicalHostname=$(($count_LogicalHostname + 1)) 278 fi 279 ;; 280 SUNW.HAStoragePlus) 281 rs_enb=`scha_resource_get -O ON_OFF_SWITCH -R $rs -G $rgname` 282 if [ "$rs_enb" = "ENABLED" ] 283 then 284 count_HAStoragePlus=$(($count_HAStoragePlus + 1)) 285 fi 286 ;; 287 esac 288 done 289 if [ $count_LogicalHostname -lt 1 ] 290 then 291 logger -p ${SYSLOG_FACILITY}.notice \ 292 -t "NWS.[$ARGV0]" `gettext "Missing Enabled Logical Host in resource group <$rgname> for Remote Mirror"` 293 return 2 294 elif [ $count_LogicalHostname -gt 1 ] 295 then 296 logger -p ${SYSLOG_FACILITY}.notice \ 297 -t "NWS.[$ARGV0]" `gettext "Too Many Enabled Logical Host in resource group <$rgname> for Remote Mirror"` 298 return 2 299 fi 300 301 if [ $count_HAStoragePlus -lt 1 ] 302 then 303 logger -p ${SYSLOG_FACILITY}.notice \ 304 -t "NWS.[$ARGV0]" `gettext "Missing Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror"` 305 return 2 306 elif [ $count_HAStoragePlus -gt 1 ] 307 then 308 logger -p ${SYSLOG_FACILITY}.notice \ 309 -t "NWS.[$ARGV0]" `gettext "Too Many Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror"` 310 return 2 311 fi 312 313# Invoke switchfunc to switch the resource group. 314 315 switchfunc $rgname $dg & 316 pid=$! 317 mkdir -p /var/run/scnws/ 318 rm -f /var/run/scnws/$dg.pid 319 echo $pid > /var/run/scnws/$dg.pid 320 321 return 0 322} 323 324 325# 326# Functions 327# 328 329usage() 330{ 331 logger -p ${SYSLOG_FACILITY}.err \ 332 -t "NWS.[$ARGV0]" "usage: $ARGV0 { start | stop } diskgroup" 333 exit 1 334} 335 336 337# Input: arg1) $NWS_START_DIR - location of NWS scripts 338# arg2) start / stop 339# arg3 ) device group - $2 340# arg4) sndr_ena / sndr_dis 341# Output: Nothing. Log error if seen 342 343process_dir() 344{ 345 typeset dir=$1 346 typeset arg1=$2 347 typeset dg=$3 348 typeset arg2=$4 349 typeset RDC=$dir/10rdc 350 351 if [[ -d $dir ]] 352 then 353 for f in $dir/* 354 do 355 # process scripts in the directories in lexical order 356 # note - no leading S or K unlike /etc/rc?.d/ 357 358 if [ -s $f ] && [ $arg2 != "sndr_dis" ] 359 then 360 # run script and pipe output through 361 # logger into syslog 362 363 /usr/bin/ksh $f $arg1 $dg 2>&1 | 364 logger -p ${SYSLOG_FACILITY}.notice \ 365 -t "NWS.[${ARGV0}:$(basename $f)]" 366 else 367 # SNDR misconfigured - prevent start 368 if [ -s $f ] && [ $f != $RDC ] 369 then 370 # run script and pipe output through 371 # logger into syslog 372 /usr/bin/ksh $f $arg1 $dg 2>&1 | 373 logger -p ${SYSLOG_FACILITY}.notice \ 374 -t "NWS.[${ARGV0}:$(basename $f)]" 375 fi 376 fi 377 done 378 else 379 logger -p ${SYSLOG_FACILITY}.err \ 380 -t "NWS.[$ARGV0]" "no directory: $dir" 381 fi 382} 383 384 385# 386# main 387# 388 389if [ $# -ne 2 ] 390then 391 usage 392 # not reached 393fi 394 395 396case "$1" in 397start) 398 logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "starting: $ARGV0 $*" 399 do_scswitch $2 400 retval=$? 401 if [ $retval == 2 ] 402 then 403 logger -p ${SYSLOG_FACILITY}.err \ 404 -t "NWS.[$ARGV0]" "**FATAL ERROR** Remote Mirror is mis-configured and DISABLED for devicegroup <"$2"> " 405 # Disable SNDR 406 process_dir $NWS_START_DIR start "$2" sndr_dis 407 else 408 process_dir $NWS_START_DIR start "$2" sndr_ena 409 fi 410 ;; 411stop) 412 logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "stopping: $ARGV0 $*" 413 process_dir $NWS_STOP_DIR stop "$2" sndr_ena 414 kill_scswitch $2 415 ;; 416 417*) 418 usage 419 # not reached 420 ;; 421esac 422 423logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "completed: $ARGV0 $*" 424 425exit 0 426