1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <unistd.h> 27 #include <sys/types.h> 28 #include <sys/stat.h> 29 #include <sys/statvfs.h> 30 #include <sys/uadmin.h> 31 #include <sys/resource.h> 32 #include <fcntl.h> 33 #include <stdio.h> 34 #include <thread.h> 35 #include <meta.h> 36 #include <sdssc.h> 37 #include <mdmn_changelog.h> 38 #include "mdmn_subr.h" 39 40 /* 41 * This is the communication daemon for SVM Multi Node Disksets. 42 * It runs on every node and provides the following rpc services: 43 * - mdmn_send_svc_2 44 * - mdmn_work_svc_2 45 * - mdmn_wakeup_initiator_svc_2 46 * - mdmn_wakeup_master_svc_2 47 * - mdmn_comm_lock_svc_2 48 * - mdmn_comm_unlock_svc_2 49 * - mdmn_comm_suspend_svc_2 50 * - mdmn_comm_resume_svc_2 51 * - mdmn_comm_reinit_set_svc_2 52 * where send, lock, unlock and reinit are meant for external use, 53 * work and the two wakeups are for internal use only. 54 * 55 * NOTE: 56 * On every node only one of those xxx_2 functions can be active at the 57 * same time because the daemon is single threaded. 
 *
 * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s
 * as part of their handlers, so those aspects are multi-threaded)
 *
 * In case an event occurs that has to be propagated to all the nodes...
 *
 * One node (the initiator)
 *	calls the libmeta function mdmn_send_message()
 *	This function calls the local daemon thru mdmn_send_svc_2.
 *
 * On the initiator:
 *	mdmn_send_svc_2()
 *	    - starts a thread -> mdmn_send_to_work() and returns.
 *	mdmn_send_to_work()
 *	    - sends this message over to the master of the diskset.
 *	      This is done by calling mdmn_work_svc_2 on the master.
 *	    - registers to the initiator_table
 *	    - exits without doing a svc_sendreply() for the call to
 *	      mdmn_send_svc_2. This means that call is blocked until somebody
 *	      (see end of this comment) does a svc_sendreply().
 *	      This means mdmn_send_message() does not yet return.
 *	    - A timeout surveillance is started at this point.
 *	      This means in case the master doesn't reply at all in an
 *	      appropriate time, an error condition is returned
 *	      to the caller.
 *
 * On the master:
 *	mdmn_work_svc_2()
 *	    - starts a thread -> mdmn_master_process_msg() and returns
 *	mdmn_master_process_msg()
 *	    - logs the message to the change log
 *	    - executes the message locally
 *	    - flags the message in the change log
 *	    - sends the message to mdmn_work_svc_2() on all the
 *	      other nodes (slaves)
 *	      after each call to mdmn_work_svc_2 the thread goes to sleep and
 *	      will be woken up by mdmn_wakeup_master_svc_2() as soon as the
 *	      slave node is done with this message.
 *	    - In case the slave doesn't respond in an appropriate time, an
 *	      error is assumed to ensure the master doesn't wait forever.
 *
 * On a slave:
 *	mdmn_work_svc_2()
 *	    - starts a thread -> mdmn_slave_process_msg() and returns
 *	mdmn_slave_process_msg()
 *	    - processes this message locally by calling the appropriate message
 *	      handler, that creates some result.
 *	    - sends that result thru a call to mdmn_wakeup_master_svc_2() to
 *	      the master.
 *
 * Back on the master:
 *	mdmn_wakeup_master_svc_2()
 *	    - stores the result into the master_table.
 *	    - signals the mdmn_master_process_msg-thread.
 *	    - returns
 *	mdmn_master_process_msg()
 *	    - after getting the results from all nodes
 *	    - sends them back to the initiating node thru a call to
 *	      mdmn_wakeup_initiator_svc_2.
 *
 * Back on the initiator:
 *	mdmn_wakeup_initiator_svc_2()
 *	    - calls svc_sendreply() which makes the call to mdmn_send_svc_2()
 *	      return.
 *	      which allows the initial mdmn_send_message() call to return.
 */

/*
 * Debug output stream of the commd; NULL as long as no output file is open.
 * (Re)opened by setup_debug().
 */
FILE *commdout;		/* debug output for the commd */
char *commdoutfile;	/* file name for the above output */
/* want at least 10 MB free space when logging into a file */
#define	MIN_FS_SPACE	(10LL * 1024 * 1024)

/*
 * Number of outstanding messages that were initiated by this node.
 * If zero, check_timeouts goes to sleep.
 * mdmn_send_to_work() increments the counter and signals check_timeout_cv
 * after it has registered a message in the initiator table.
 */
uint_t	messages_on_their_way;
mutex_t	check_timeout_mutex;	/* need mutex to protect above */
cond_t	check_timeout_cv;	/* trigger for check_timeouts */

/* for printing out time stamps; set to gethrtime() by global_init() */
hrtime_t __savetime;

/* RPC clients for every set and every node and their protecting locks */
CLIENT	*client[MD_MAXSETS][NNODES];
rwlock_t client_rwlock[MD_MAXSETS];

/* the descriptors of all possible sets and their protectors */
struct md_set_desc *set_descriptor[MD_MAXSETS];
rwlock_t set_desc_rwlock[MD_MAXSETS];

/*
 * The daemon to daemon communication has to timeout quickly.
 * Installed on every RPC client via clnt_control(CLSET_TIMEOUT).
 */
static struct timeval FOUR_SECS = { 4, 0 };

/*
 * These indicate if a set has already been setup.
 * Each entry is a bitmask of MDMN_SET_* flags (see mdmn_init_set());
 * check_timeouts() only looks at sets whose entry equals MDMN_SET_READY.
 */
int md_mn_set_inited[MD_MAXSETS];

/*
 * For every set we have a message completion table and protecting mutexes.
 * mct[setno] is the mmap()ed table file, mapped in by mdmn_init_set().
 */
md_mn_mct_t *mct[MD_MAXSETS];
mutex_t mct_mutex[MD_MAXSETS][MD_MN_NCLASSES];

/* Stuff to describe the global status of the commd on one node */
#define	MD_CGS_INITED	0x0001	/* global_init() has completed */
#define	MD_CGS_ABORTED	0x0002	/* return everything with MDMNE_ABORT */
uint_t md_commd_global_state = 0;	/* No state when starting up */

/*
 * Global verbosity level for the daemon.
 * Read from the config by setup_debug(); flush_fcout() lowers it to
 * MD_MMV_NULL when the log filesystem runs short of space.
 */
uint_t md_commd_global_verb;

/*
 * libmeta doesn't like multiple threads in metaget_setdesc().
 * So we must protect access to it with a global lock
 */
mutex_t get_setdesc_mutex;

/*
 * Need a way to block single message types,
 * hence an array with a status for every message type
 */
uint_t msgtype_lock_state[MD_MN_NMESSAGES];

/* for reading in the config file */
#define	MAX_LINE_SIZE 1024

/* accessors for the debug settings; defined outside this file */
extern char *commd_get_outfile(void);
extern uint_t commd_get_verbosity(void);

/*
 * mdmn_clnt_create is a helper function for meta_client_create_retry.
It 189 * merely needs to call clnt_create_timed, and meta_client_create_retry 190 * will take care of the rest. 191 */ 192 /* ARGSUSED */ 193 static CLIENT * 194 mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out) 195 { 196 md_mnnode_desc *node = (md_mnnode_desc *)data; 197 198 return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, TWO, "tcp", 199 time_out)); 200 } 201 202 #define FLUSH_DEBUGFILE() \ 203 if (commdout != (FILE *)NULL) { \ 204 (void) fflush(commdout); \ 205 (void) fsync(fileno(commdout)); \ 206 } 207 208 static void 209 panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval, 210 md_mn_result_t *slave_result) 211 { 212 md_mn_commd_err_t commd_err; 213 md_error_t mne = mdnullerror; 214 char *msg_buf; 215 216 msg_buf = (char *)calloc(MAXPATHLEN + 1, sizeof (char)); 217 218 FLUSH_DEBUGFILE(); 219 220 if (master_err != MDMNE_ACK) { 221 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC " 222 "fail on master when processing message type %d\n", type); 223 } else if (slave_result == NULL) { 224 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail " 225 "on node %d when processing message type %d\n", nid, type); 226 } else { 227 (void) snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: " 228 "Inconsistent return value from node %d when processing " 229 "message type %d. 
Master exitval = %d, " 230 "Slave exitval = %d\n", nid, type, master_exitval, 231 slave_result->mmr_exitval); 232 } 233 commd_err.size = strlen(msg_buf); 234 commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0]; 235 236 (void) metaioctl(MD_MN_COMMD_ERR, &commd_err, &mne, "rpc.mdcommd"); 237 (void) uadmin(A_DUMP, AD_BOOT, NULL); 238 } 239 240 static void 241 flush_fcout() 242 { 243 struct statvfs64 vfsbuf; 244 long long avail_bytes; 245 int warned = 0; 246 247 for (; ; ) { 248 (void) sleep(10); 249 /* No output file, nothing to do */ 250 if (commdout == (FILE *)NULL) 251 continue; 252 253 /* 254 * stat the appropriate filesystem to check for available space. 255 */ 256 if (statvfs64(commdoutfile, &vfsbuf)) { 257 continue; 258 } 259 260 avail_bytes = vfsbuf.f_frsize * vfsbuf.f_bavail; 261 /* 262 * If we don't have enough space, we print out a warning. 263 * And we drop the verbosity level to NULL 264 * In case the condtion doesn't go away, we don't repeat 265 * the warning. 266 */ 267 if (avail_bytes < MIN_FS_SPACE) { 268 if (warned) { 269 continue; 270 } 271 commd_debug(MD_MMV_SYSLOG, 272 "NOT enough space available for logging\n"); 273 commd_debug(MD_MMV_SYSLOG, 274 "Have %lld bytes, need %lld bytes\n", 275 avail_bytes, MIN_FS_SPACE); 276 warned = 1; 277 md_commd_global_verb = MD_MMV_NULL; 278 } else { 279 warned = 0; 280 } 281 282 (void) fflush(commdout); 283 } 284 } 285 286 /* safer version of clnt_destroy. 
If clnt is NULL don't do anything */ 287 #define mdmn_clnt_destroy(clnt) { \ 288 if (clnt) \ 289 clnt_destroy(clnt); \ 290 } 291 292 /* 293 * Own version of svc_sendreply that checks the integrity of the transport 294 * handle and so prevents us from core dumps in the real svc_sendreply() 295 */ 296 void 297 mdmn_svc_sendreply(SVCXPRT *transp, xdrproc_t xdr, caddr_t data) 298 { 299 if (SVC_STAT(transp) == XPRT_DIED) { 300 commd_debug(MD_MMV_MISC, 301 "mdmn_svc_sendreply: XPRT_DIED\n"); 302 return; 303 } 304 (void) svc_sendreply(transp, xdr, data); 305 } 306 307 /* 308 * timeout_initiator(set, class) 309 * 310 * Alas, I sent a message and didn't get a response back in aproppriate time. 311 * 312 * timeout_initiator() takes care for doing the needed svc_sendreply() to the 313 * calling mdmn_send_message, so that guy doesn't wait forever 314 * What is done here is pretty much the same as what is done in 315 * wakeup initiator. The difference is that we cannot provide for any results, 316 * of course and we set the comm_state to MDMNE_TIMEOUT. 317 * 318 * By doing so, mdmn_send_message can decide if a retry would make sense or not. 319 * It's not our's to decide that here. 320 */ 321 void 322 timeout_initiator(set_t setno, md_mn_msgclass_t class) 323 { 324 SVCXPRT *transp; 325 md_mn_msgid_t mid; 326 md_mn_result_t *resultp; 327 328 resultp = Zalloc(sizeof (md_mn_result_t)); 329 resultp->mmr_comm_state = MDMNE_TIMEOUT; 330 331 commd_debug(MD_MMV_MISC, 332 "timeout_initiator set = %d, class = %d\n", setno, class); 333 334 transp = mdmn_get_initiator_table_transp(setno, class); 335 mdmn_get_initiator_table_id(setno, class, &mid); 336 337 commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n", 338 MSGID_ELEMS(mid)); 339 /* 340 * Give the result the corresponding msgid from the failed message. 
341 */ 342 MSGID_COPY(&mid, &(resultp->mmr_msgid)); 343 344 /* return to mdmn_send_message() and let it deal with the situation */ 345 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 346 347 free(resultp); 348 commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n"); 349 svc_done(transp); 350 mdmn_unregister_initiator_table(setno, class); 351 } 352 353 354 /* 355 * check_timeouts - thread 356 * 357 * This implements a timeout surveillance for messages sent from the 358 * initiator to the master. 359 * 360 * If a message is started, this thread is triggered thru 361 * cond_signal(&check_timeout_cv) and we keep track of the numbers of 362 * messages that are outstanding (messages_on_their_way). 363 * 364 * As long as there are messages on their way, this thread never goes to sleep. 365 * It'll keep checking all class/set combinations for outstanding messages. 366 * If one is found, it's checked if this message is overdue. In that case, 367 * timeout_initiator() is called to wakeup the calling mdmn_send_message and 368 * to clean up the mess. 369 * 370 * If the result from the master arrives later, this message is considered 371 * to be unsolicited. And will be ignored. 372 */ 373 374 void 375 check_timeouts() 376 { 377 set_t setno; 378 time_t now, then; 379 mutex_t *mx; 380 md_mn_msgclass_t class; 381 382 for (; ; ) { 383 now = time((time_t *)NULL); 384 for (setno = 1; setno < MD_MAXSETS; setno++) { 385 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 386 continue; 387 } 388 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; 389 class++) { 390 mx = mdmn_get_initiator_table_mx(setno, class); 391 (void) mutex_lock(mx); 392 393 /* then is the registered time */ 394 then = 395 mdmn_get_initiator_table_time(setno, class); 396 if ((then != 0) && (now > then)) { 397 timeout_initiator(setno, class); 398 } 399 (void) mutex_unlock(mx); 400 } 401 } 402 /* it's ok to check only once per second */ 403 (void) sleep(1); 404 405 /* is there work to do? 
*/ 406 (void) mutex_lock(&check_timeout_mutex); 407 if (messages_on_their_way == 0) { 408 (void) cond_wait(&check_timeout_cv, 409 &check_timeout_mutex); 410 } 411 (void) mutex_unlock(&check_timeout_mutex); 412 } 413 } 414 415 void 416 setup_debug(void) 417 { 418 char *tmp_dir; 419 420 /* Read in the debug-controlling tokens from runtime.cf */ 421 md_commd_global_verb = commd_get_verbosity(); 422 /* 423 * If the user didn't specify a verbosity level in runtime.cf 424 * we can safely return here. As we don't intend to printout 425 * debug messages, we don't need to check for the output file. 426 */ 427 if (md_commd_global_verb == 0) { 428 return; 429 } 430 431 /* if commdout is non-NULL it is an open FILE, we'd better close it */ 432 if (commdout != (FILE *)NULL) { 433 (void) fclose(commdout); 434 } 435 436 commdoutfile = commd_get_outfile(); 437 438 /* setup the debug output */ 439 if (commdoutfile == (char *)NULL) { 440 /* if no valid file was specified, use the default */ 441 commdoutfile = "/var/run/commd.out"; 442 commdout = fopen(commdoutfile, "a"); 443 } else { 444 /* check if the directory exists and is writable */ 445 tmp_dir = strdup(commdoutfile); 446 if ((access(dirname(tmp_dir), X_OK|W_OK)) || 447 ((commdout = fopen(commdoutfile, "a")) == (FILE *)NULL)) { 448 syslog(LOG_ERR, 449 "Can't write to specified output file %s,\n" 450 "using /var/run/commd.out instead\n", commdoutfile); 451 free(commdoutfile); 452 commdoutfile = "/var/run/commd.out"; 453 commdout = fopen(commdoutfile, "a"); 454 } 455 free(tmp_dir); 456 } 457 458 if (commdout == (FILE *)NULL) { 459 syslog(LOG_ERR, "Can't write to debug output file %s\n", 460 commdoutfile); 461 } 462 } 463 464 /* 465 * mdmn_is_node_dead checks to see if a node is dead using 466 * the SunCluster infrastructure which is a stable interface. 467 * If unable to contact SunCuster the node is assumed to be alive. 
468 * Return values: 469 * 1 - node is dead 470 * 0 - node is alive 471 */ 472 int 473 mdmn_is_node_dead(md_mnnode_desc *node) 474 { 475 char *fmt = "/usr/cluster/bin/scha_cluster_get -O NODESTATE_NODE "; 476 char *cmd; 477 size_t size; 478 char buf[10]; 479 FILE *ptr; 480 int retval = 0; 481 482 /* I know that I'm alive */ 483 if (strcmp(node->nd_nodename, mynode()) == 0) 484 return (retval); 485 486 size = strlen(fmt) + strlen(node->nd_nodename) + 1; 487 cmd = Zalloc(size); 488 (void) strlcat(cmd, fmt, size); 489 (void) strlcat(cmd, node->nd_nodename, size); 490 491 if ((ptr = popen(cmd, "r")) != NULL) { 492 if (fgets(buf, sizeof (buf), ptr) != NULL) { 493 /* If scha_cluster_get returned DOWN - return dead */ 494 if (strncmp(buf, "DOWN", 4) == 0) 495 retval = 1; 496 } 497 (void) pclose(ptr); 498 } 499 Free(cmd); 500 return (retval); 501 } 502 503 /* 504 * global_init() 505 * 506 * Perform some global initializations. 507 * 508 * the following routines have to call this before operation can start: 509 * - mdmn_send_svc_2 510 * - mdmn_work_svc_2 511 * - mdmn_comm_lock_svc_2 512 * - mdmn_comm_unlock_svc_2 513 * - mdmn_comm_suspend_svc_2 514 * - mdmn_comm_resume_svc_2 515 * - mdmn_comm_reinit_set_svc_2 516 * 517 * This is a single threaded daemon, so it can only be in one of the above 518 * routines at the same time. 519 * This means, global_init() cannot be called more than once at the same time. 520 * Hence, no lock is needed. 
521 */ 522 void 523 global_init(void) 524 { 525 set_t set; 526 md_mn_msgclass_t class; 527 struct sigaction sighandler; 528 time_t clock_val; 529 struct rlimit commd_limit; 530 531 532 533 /* Do these global initializations only once */ 534 if (md_commd_global_state & MD_CGS_INITED) { 535 return; 536 } 537 (void) sdssc_bind_library(); 538 539 /* setup the debug options from the config file */ 540 setup_debug(); 541 542 /* make sure that we don't run out of file descriptors */ 543 commd_limit.rlim_cur = commd_limit.rlim_max = RLIM_INFINITY; 544 if (setrlimit(RLIMIT_NOFILE, &commd_limit) != 0) { 545 syslog(LOG_WARNING, gettext("setrlimit failed." 546 "Could not increase the max file descriptors")); 547 } 548 549 /* Make setup_debug() be the action in case of SIGHUP */ 550 sighandler.sa_flags = 0; 551 (void) sigfillset(&sighandler.sa_mask); 552 sighandler.sa_handler = (void (*)(int)) setup_debug; 553 (void) sigaction(SIGHUP, &sighandler, NULL); 554 555 __savetime = gethrtime(); 556 (void) time(&clock_val); 557 commd_debug(MD_MMV_MISC, "global init called %s\n", ctime(&clock_val)); 558 559 /* start a thread that flushes out the debug on a regular basis */ 560 (void) thr_create(NULL, 0, (void *(*)(void *))flush_fcout, 561 (void *) NULL, THR_DETACHED, NULL); 562 563 /* global rwlock's / mutex's / cond_t's go here */ 564 (void) mutex_init(&check_timeout_mutex, USYNC_THREAD, NULL); 565 (void) cond_init(&check_timeout_cv, USYNC_THREAD, NULL); 566 (void) mutex_init(&get_setdesc_mutex, USYNC_THREAD, NULL); 567 568 /* Make sure the initiator table is initialized correctly */ 569 for (set = 0; set < MD_MAXSETS; set++) { 570 for (class = 0; class < MD_MN_NCLASSES; class++) { 571 mdmn_unregister_initiator_table(set, class); 572 } 573 } 574 575 576 /* setup the check for timeouts */ 577 (void) thr_create(NULL, 0, (void *(*)(void *))check_timeouts, 578 (void *) NULL, THR_DETACHED, NULL); 579 580 md_commd_global_state |= MD_CGS_INITED; 581 } 582 583 584 /* 585 * 
mdmn_init_client(setno, nodeid) 586 * called if client[setno][nodeid] is NULL 587 * 588 * NOTE: Must be called with set_desc_rwlock held as a reader 589 * NOTE: Must be called with client_rwlock held as a writer 590 * 591 * If the rpc client for this node has not been setup for any set, we do it now. 592 * 593 * Returns 0 on success (node found in set, rpc client setup) 594 * -1 if metaget_setdesc failed, 595 * -2 if node not part of set 596 * -3 if clnt_create fails 597 */ 598 static int 599 mdmn_init_client(set_t setno, md_mn_nodeid_t nid) 600 { 601 md_error_t ep = mdnullerror; 602 md_mnnode_desc *node; 603 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */ 604 605 sd = set_descriptor[setno]; 606 607 /* 608 * Is the appropriate set_descriptor already initialized ? 609 * Can't think of a scenario where this is not the case, but we'd better 610 * check for it anyway. 611 */ 612 if (sd == NULL) { 613 mdsetname_t *sp; 614 615 /* readlock -> writelock */ 616 (void) rw_unlock(&set_desc_rwlock[setno]); 617 (void) rw_wrlock(&set_desc_rwlock[setno]); 618 sp = metasetnosetname(setno, &ep); 619 /* Only one thread is supposed to be in metaget_setdesc() */ 620 (void) mutex_lock(&get_setdesc_mutex); 621 sd = metaget_setdesc(sp, &ep); 622 (void) mutex_unlock(&get_setdesc_mutex); 623 if (sd == NULL) { 624 /* back to ... */ 625 (void) rw_unlock(&set_desc_rwlock[setno]); 626 /* ... 
readlock */ 627 (void) rw_rdlock(&set_desc_rwlock[setno]); 628 return (-1); 629 } 630 set_descriptor[setno] = sd; 631 /* back to readlock */ 632 (void) rw_unlock(&set_desc_rwlock[setno]); 633 (void) rw_rdlock(&set_desc_rwlock[setno]); 634 } 635 636 /* first we have to find the node name for this node id */ 637 for (node = sd->sd_nodelist; node; node = node->nd_next) { 638 if (node->nd_nodeid == nid) 639 break; /* we found our node in this set */ 640 } 641 642 643 if (node == (md_mnnode_desc *)NULL) { 644 commd_debug(MD_MMV_SYSLOG, 645 "FATAL: node %d not found in set %d\n", nid, setno); 646 (void) rw_unlock(&set_desc_rwlock[setno]); 647 return (-2); 648 } 649 650 commd_debug(MD_MMV_INIT, "init: %s has the flags: 0x%x\n", 651 node->nd_nodename ? node->nd_nodename : "NULL", node->nd_flags); 652 653 /* Did this node join the diskset? */ 654 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 655 commd_debug(MD_MMV_INIT, "init: %s didn't join set %d\n", 656 node->nd_nodename ? node->nd_nodename : "NULL", setno); 657 (void) rw_unlock(&set_desc_rwlock[setno]); 658 return (-2); 659 } 660 661 /* if clnt_create has not been done for that node, do it now */ 662 if (client[setno][nid] == (CLIENT *) NULL) { 663 time_t tout = 0; 664 665 /* 666 * While trying to create a connection to a node, 667 * periodically check to see if the node has been marked 668 * dead by the SunCluster infrastructure. 669 * This periodic check is needed since a non-responsive 670 * rpc.mdcommd (while it is attempting to create a connection 671 * to a dead node) can lead to large delays and/or failures 672 * in the reconfig steps. 673 */ 674 while ((client[setno][nid] == (CLIENT *) NULL) && 675 (tout < MD_CLNT_CREATE_TOUT)) { 676 client[setno][nid] = meta_client_create_retry( 677 node->nd_nodename, mdmn_clnt_create, 678 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); 679 /* Is the node dead? 
*/ 680 if (mdmn_is_node_dead(node) == 1) { 681 commd_debug(MD_MMV_SYSLOG, 682 "rpc.mdcommd: no client for dead node %s\n", 683 node->nd_nodename); 684 break; 685 } else 686 tout += MD_CLNT_CREATE_SUBTIMEOUT; 687 } 688 689 if (client[setno][nid] == (CLIENT *) NULL) { 690 clnt_pcreateerror(node->nd_nodename); 691 (void) rw_unlock(&set_desc_rwlock[setno]); 692 return (-3); 693 } 694 /* this node has the license to send */ 695 commd_debug(MD_MMV_MISC, "init_client: calling add_lic\n"); 696 add_license(node); 697 698 /* set the timeout value */ 699 clnt_control(client[setno][nid], CLSET_TIMEOUT, 700 (char *)&FOUR_SECS); 701 702 } 703 (void) rw_unlock(&set_desc_rwlock[setno]); 704 return (0); 705 } 706 707 /* 708 * check_client(setno, nodeid) 709 * 710 * must be called with reader lock held for set_desc_rwlock[setno] 711 * and must be called with reader lock held for client_rwlock[setno] 712 * Checks if the client for this set/node combination is already setup 713 * if not it upgrades the lock to a writer lock 714 * and tries to initialize the client. 715 * Finally it's checked if the client nulled out again due to some race 716 * 717 * returns 0 if there is a usable client 718 * returns MDMNE_RPC_FAIL otherwise 719 */ 720 static int 721 check_client(set_t setno, md_mn_nodeid_t nodeid) 722 { 723 int ret = 0; 724 725 while ((client[setno][nodeid] == (CLIENT *)NULL) && (ret == 0)) { 726 /* upgrade reader ... */ 727 (void) rw_unlock(&client_rwlock[setno]); 728 /* ... to writer lock. */ 729 (void) rw_wrlock(&client_rwlock[setno]); 730 if (mdmn_init_client(setno, nodeid) != 0) { 731 ret = MDMNE_RPC_FAIL; 732 } 733 /* downgrade writer ... */ 734 (void) rw_unlock(&client_rwlock[setno]); 735 /* ... back to reader lock. */ 736 (void) rw_rdlock(&client_rwlock[setno]); 737 } 738 return (ret); 739 } 740 741 /* 742 * mdmn_init_set(setno, todo) 743 * setno is the number of the set to be initialized. 
744 * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY 745 * If called with MDMN_SET_READY everything is initialized. 746 * 747 * If the set mutexes are already initialized, the caller has to hold 748 * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before 749 * calling mdmn_init_set() 750 */ 751 int 752 mdmn_init_set(set_t setno, int todo) 753 { 754 int class; 755 md_mnnode_desc *node; 756 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */ 757 mdsetname_t *sp; 758 md_error_t ep = mdnullerror; 759 md_mn_nodeid_t nid; 760 761 /* 762 * Check if we are told to setup the mutexes and 763 * if these are not yet setup 764 */ 765 if ((todo & MDMN_SET_MUTEXES) && 766 ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0)) { 767 (void) mutex_init(&mdmn_busy_mutex[setno], USYNC_THREAD, NULL); 768 (void) cond_init(&mdmn_busy_cv[setno], USYNC_THREAD, NULL); 769 (void) rwlock_init(&client_rwlock[setno], USYNC_THREAD, NULL); 770 (void) rwlock_init(&set_desc_rwlock[setno], USYNC_THREAD, NULL); 771 772 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 773 (void) mutex_init(mdmn_get_master_table_mx(setno, 774 class), USYNC_THREAD, NULL); 775 (void) cond_init(mdmn_get_master_table_cv(setno, class), 776 USYNC_THREAD, NULL); 777 (void) mutex_init(mdmn_get_initiator_table_mx(setno, 778 class), USYNC_THREAD, NULL); 779 } 780 md_mn_set_inited[setno] |= MDMN_SET_MUTEXES; 781 } 782 if ((todo & MDMN_SET_MCT) && 783 ((md_mn_set_inited[setno] & MDMN_SET_MCT) == 0)) { 784 int fd; 785 size_t filesize; 786 caddr_t addr; 787 char table_name[32]; 788 struct flock fl; 789 790 filesize = (sizeof (md_mn_mct_t)); 791 (void) snprintf(table_name, sizeof (table_name), "%s%d", 792 MD_MN_MSG_COMP_TABLE, setno); 793 /* 794 * If the mct file exists we map it into memory. 795 * Otherwise we create an empty file of appropriate 796 * size and map that into memory. 797 * The mapped areas are stored in mct[setno]. 
798 */ 799 fd = open(table_name, O_RDWR|O_CREAT|O_DSYNC, 0600); 800 if (fd < 0) { 801 commd_debug(MD_MMV_MISC, 802 "init_set: Can't open MCT\n"); 803 return (-1); 804 } 805 /* 806 * Ensure that we are the only process that has this file 807 * mapped. If another instance of rpc.mdcommd has beaten us 808 * then we display the failing process and attempt to terminate 809 * it. The next call of this routine should establish us as 810 * the only rpc.mdcommd on the system. 811 */ 812 (void) memset(&fl, 0, sizeof (fl)); 813 fl.l_type = F_WRLCK; 814 fl.l_whence = SEEK_SET; 815 fl.l_start = 0; 816 fl.l_len = filesize + 1; 817 818 if (fcntl(fd, F_SETLK, &fl) == -1) { 819 commd_debug(MD_MMV_SYSLOG, 820 "init_set: Cannot lock MCT '%s'\n", table_name); 821 if (fcntl(fd, F_GETLK, &fl) != -1) { 822 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:" 823 "Process %d holds lock\n", fl.l_pid); 824 (void) close(fd); 825 } else { 826 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:" 827 "F_GETLK failed\n"); 828 (void) close(fd); 829 return (-1); 830 } 831 832 /* 833 * Try to terminate other mdcommd process so that we 834 * can establish ourselves. 835 */ 836 if (sigsend(P_PID, fl.l_pid, 0) == 0) { 837 if (sigsend(P_PID, fl.l_pid, SIGKILL) < 0) { 838 commd_debug(MD_MMV_SYSLOG, 839 "rpc.mdcommd:" 840 "SIGKILL of %d failed\n", fl.l_pid); 841 } else { 842 commd_debug(MD_MMV_SYSLOG, 843 "rpc.mdcommd:" 844 "Process %d killed\n", fl.l_pid); 845 } 846 } else { 847 commd_debug(MD_MMV_SYSLOG, "rpc.mdcommd:" 848 "Process %d not killable\n", fl.l_pid); 849 } 850 return (-1); 851 } 852 /* 853 * To ensure that the file has the appropriate size, 854 * we write a byte at the end of the file. 
855 */ 856 (void) lseek(fd, filesize + 1, SEEK_SET); 857 (void) write(fd, "\0", 1); 858 859 /* at this point we have a file in place that we can mmap */ 860 addr = mmap(0, filesize, PROT_READ | PROT_WRITE, 861 MAP_SHARED, fd, (off_t)0); 862 if (addr == MAP_FAILED) { 863 commd_debug(MD_MMV_INIT, 864 "init_set: mmap mct error %d\n", 865 errno); 866 return (-1); 867 } 868 /* LINTED pointer alignment */ 869 mct[setno] = (md_mn_mct_t *)addr; 870 871 /* finally we initialize the mutexes that protect the mct */ 872 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 873 (void) mutex_init(&(mct_mutex[setno][class]), 874 USYNC_THREAD, NULL); 875 } 876 877 md_mn_set_inited[setno] |= MDMN_SET_MCT; 878 } 879 /* 880 * Check if we are told to setup the nodes and 881 * if these are not yet setup 882 * (Attention: negative logic here compared to above!) 883 */ 884 if (((todo & MDMN_SET_NODES) == 0) || 885 (md_mn_set_inited[setno] & MDMN_SET_NODES)) { 886 return (0); /* success */ 887 } 888 889 if ((sp = metasetnosetname(setno, &ep)) == NULL) { 890 commd_debug(MD_MMV_SYSLOG, 891 "metasetnosetname(%d) returned NULL\n", setno); 892 return (MDMNE_NOT_JOINED); 893 } 894 895 /* flush local copy of rpc.metad data */ 896 metaflushsetname(sp); 897 898 (void) mutex_lock(&get_setdesc_mutex); 899 sd = metaget_setdesc(sp, &ep); 900 (void) mutex_unlock(&get_setdesc_mutex); 901 902 if (sd == NULL) { 903 commd_debug(MD_MMV_SYSLOG, 904 "metaget_setdesc(%d) returned NULL\n", setno); 905 return (MDMNE_NOT_JOINED); 906 } 907 908 /* 909 * if this set is not a multinode set or 910 * this node didn't join yet the diskset, better don't do anything 911 */ 912 if ((MD_MNSET_DESC(sd) == 0) || 913 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN) == 0) { 914 commd_debug(MD_MMV_INIT, "didn't yet join set %d\n", setno); 915 return (MDMNE_NOT_JOINED); 916 } 917 918 for (node = sd->sd_nodelist; node != NULL; node = node->nd_next) { 919 time_t tout = 0; 920 nid = node->nd_nodeid; 921 922 
commd_debug(MD_MMV_INIT, 923 "setting up: node=%s, priv_ic=%s, flags=0x%x\n", 924 node->nd_nodename ? node->nd_nodename : "NULL", 925 node->nd_priv_ic ? node->nd_priv_ic : "NULL", 926 node->nd_flags); 927 928 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 929 commd_debug(MD_MMV_INIT, 930 "init: %s didn't join set %d\n", 931 node->nd_nodename ? node->nd_nodename : "NULL", 932 setno); 933 continue; 934 } 935 936 if (client[setno][nid] != (CLIENT *) NULL) { 937 /* already inited */ 938 commd_debug(MD_MMV_INIT, "init: already: node=%s\n", 939 node->nd_nodename ? node->nd_nodename : "NULL"); 940 continue; 941 } 942 943 /* 944 * While trying to create a connection to a node, 945 * periodically check to see if the node has been marked 946 * dead by the SunCluster infrastructure. 947 * This periodic check is needed since a non-responsive 948 * rpc.mdcommd (while it is attempting to create a connection 949 * to a dead node) can lead to large delays and/or failures 950 * in the reconfig steps. 951 */ 952 while ((client[setno][nid] == (CLIENT *) NULL) && 953 (tout < MD_CLNT_CREATE_TOUT)) { 954 client[setno][nid] = meta_client_create_retry( 955 node->nd_nodename, mdmn_clnt_create, 956 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); 957 /* Is the node dead? */ 958 if (mdmn_is_node_dead(node) == 1) { 959 commd_debug(MD_MMV_SYSLOG, 960 "rpc.mdcommd: no client for dead node %s\n", 961 node->nd_nodename); 962 break; 963 } else 964 tout += MD_CLNT_CREATE_SUBTIMEOUT; 965 } 966 967 if (client[setno][nid] == (CLIENT *) NULL) { 968 clnt_pcreateerror(node->nd_nodename); 969 /* 970 * If we cannot connect to a single node 971 * (maybe because it is down) we mark this node as not 972 * owned and continue with the next node in the list. 973 * This is better than failing the entire starting up 974 * of the commd system. 
975 */ 976 node->nd_flags &= ~MD_MN_NODE_OWN; 977 commd_debug(MD_MMV_SYSLOG, 978 "WARNING couldn't create client for %s\n" 979 "Reconfig cycle required\n", 980 node->nd_nodename); 981 commd_debug(MD_MMV_INIT, 982 "WARNING couldn't create client for %s\n" 983 "Reconfig cycle required\n", 984 node->nd_nodename); 985 continue; 986 } 987 /* this node has the license to send */ 988 commd_debug(MD_MMV_MISC, "init_set: calling add_lic\n"); 989 add_license(node); 990 991 /* set the timeout value */ 992 clnt_control(client[setno][nid], CLSET_TIMEOUT, 993 (char *)&FOUR_SECS); 994 995 commd_debug(MD_MMV_INIT, "init: done: node=%s\n", 996 node->nd_nodename ? node->nd_nodename : "NULL"); 997 } 998 999 set_descriptor[setno] = sd; 1000 md_mn_set_inited[setno] |= MDMN_SET_NODES; 1001 return (0); /* success */ 1002 } 1003 1004 void * 1005 mdmn_send_to_work(void *arg) 1006 { 1007 int *rpc_err = NULL; 1008 int success; 1009 int try_master; 1010 set_t setno; 1011 mutex_t *mx; /* protection for initiator_table */ 1012 SVCXPRT *transp; 1013 md_mn_msg_t *msg; 1014 md_mn_nodeid_t set_master; 1015 md_mn_msgclass_t class; 1016 md_mn_msg_and_transp_t *matp = (md_mn_msg_and_transp_t *)arg; 1017 1018 msg = matp->mat_msg; 1019 transp = matp->mat_transp; 1020 1021 class = mdmn_get_message_class(msg->msg_type); 1022 setno = msg->msg_setno; 1023 1024 /* set the sender, so the master knows who to send the results */ 1025 (void) rw_rdlock(&set_desc_rwlock[setno]); 1026 msg->msg_sender = set_descriptor[setno]->sd_mn_mynode->nd_nodeid; 1027 set_master = set_descriptor[setno]->sd_mn_master_nodeid; 1028 1029 mx = mdmn_get_initiator_table_mx(setno, class); 1030 (void) mutex_lock(mx); 1031 1032 /* 1033 * Here we check, if the initiator table slot for this set/class 1034 * combination is free to use. 
1035 * If this is not the case, we return CLASS_BUSY forcing the 1036 * initiating send_message call to retry 1037 */ 1038 success = mdmn_check_initiator_table(setno, class); 1039 if (success == MDMNE_CLASS_BUSY) { 1040 md_mn_msgid_t active_mid; 1041 1042 mdmn_get_initiator_table_id(setno, class, &active_mid); 1043 1044 commd_debug(MD_MMV_SEND, 1045 "send_to_work: received but locally busy " 1046 "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, " 1047 "active msg=(%d, 0x%llx-%d)\n", 1048 MSGID_ELEMS(msg->msg_msgid), setno, class, 1049 msg->msg_type, MSGID_ELEMS(active_mid)); 1050 } else { 1051 commd_debug(MD_MMV_SEND, 1052 "send_to_work: received (%d, 0x%llx-%d), " 1053 "set=%d, class=%d, type=%d\n", 1054 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 1055 } 1056 1057 try_master = 2; /* return failure after two retries */ 1058 while ((success == MDMNE_ACK) && (try_master--)) { 1059 (void) rw_rdlock(&client_rwlock[setno]); 1060 /* is the rpc client to the master still around ? */ 1061 if (check_client(setno, set_master)) { 1062 success = MDMNE_RPC_FAIL; 1063 FLUSH_DEBUGFILE(); 1064 (void) rw_unlock(&client_rwlock[setno]); 1065 break; /* out of try_master-loop */ 1066 } 1067 1068 /* 1069 * Send the request to the work function on the master 1070 * this call will return immediately 1071 */ 1072 rpc_err = mdmn_work_2(msg, client[setno][set_master], 1073 set_master); 1074 1075 /* Everything's Ok? */ 1076 if (rpc_err == NULL) { 1077 success = MDMNE_RPC_FAIL; 1078 /* 1079 * Probably something happened to the daemon on the 1080 * master. Kill the client, and try again... 
1081 */ 1082 (void) rw_unlock(&client_rwlock[setno]); 1083 (void) rw_wrlock(&client_rwlock[setno]); 1084 mdmn_clnt_destroy(client[setno][set_master]); 1085 if (client[setno][set_master] != (CLIENT *)NULL) { 1086 client[setno][set_master] = (CLIENT *)NULL; 1087 } 1088 (void) rw_unlock(&client_rwlock[setno]); 1089 continue; 1090 1091 } else if (*rpc_err != MDMNE_ACK) { 1092 /* something went wrong, break out */ 1093 success = *rpc_err; 1094 free(rpc_err); 1095 (void) rw_unlock(&client_rwlock[setno]); 1096 break; /* out of try_master-loop */ 1097 } 1098 1099 (void) rw_unlock(&client_rwlock[setno]); 1100 free(rpc_err); 1101 1102 /* 1103 * If we are here, we sucessfully delivered the message. 1104 * We register the initiator_table, so that 1105 * wakeup_initiator_2 can do the sendreply with the 1106 * results for us. 1107 */ 1108 success = MDMNE_ACK; 1109 mdmn_register_initiator_table(setno, class, msg, transp); 1110 1111 /* tell check_timeouts, there's work to do */ 1112 (void) mutex_lock(&check_timeout_mutex); 1113 messages_on_their_way++; 1114 (void) cond_signal(&check_timeout_cv); 1115 (void) mutex_unlock(&check_timeout_mutex); 1116 break; /* out of try_master-loop */ 1117 } 1118 1119 (void) rw_unlock(&set_desc_rwlock[setno]); 1120 1121 if (success == MDMNE_ACK) { 1122 commd_debug(MD_MMV_SEND, 1123 "send_to_work: registered (%d, 0x%llx-%d)\n", 1124 MSGID_ELEMS(msg->msg_msgid)); 1125 } else { 1126 /* In case of failure do the sendreply now */ 1127 md_mn_result_t *resultp; 1128 resultp = Zalloc(sizeof (md_mn_result_t)); 1129 resultp->mmr_comm_state = success; 1130 /* 1131 * copy the MSGID so that we know _which_ message 1132 * failed (if the transp has got mangled) 1133 */ 1134 MSGID_COPY(&(msg->msg_msgid), &(resultp->mmr_msgid)); 1135 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 1136 commd_debug(MD_MMV_SEND, 1137 "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n", 1138 MSGID_ELEMS(msg->msg_msgid), success); 1139 free_result(resultp); 1140 /* 
1141 * We don't have a timeout registered to wake us up, so we're 1142 * now done with this handle. Release it back to the pool. 1143 */ 1144 svc_done(transp); 1145 1146 } 1147 1148 free_msg(msg); 1149 /* the alloc was done in mdmn_send_svc_2 */ 1150 Free(matp); 1151 (void) mutex_unlock(mx); 1152 return (NULL); 1153 1154 } 1155 1156 /* 1157 * do_message_locally(msg, result) 1158 * Process a message locally on the master 1159 * Lookup the MCT if the message has already been processed. 1160 * If not, call the handler and store the result 1161 * If yes, retrieve the result from the MCT. 1162 * Return: 1163 * MDMNE_ACK in case of success 1164 * MDMNE_LOG_FAIL if the MCT could not be checked 1165 */ 1166 static int 1167 do_message_locally(md_mn_msg_t *msg, md_mn_result_t *result) 1168 { 1169 int completed; 1170 set_t setno; 1171 md_mn_msgtype_t msgtype = msg->msg_type; 1172 md_mn_msgclass_t class; 1173 1174 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res); 1175 1176 handler = mdmn_get_handler(msgtype); 1177 if (handler == NULL) { 1178 result->mmr_exitval = 0; 1179 /* let the sender decide if this is an error or not */ 1180 result->mmr_comm_state = MDMNE_NO_HANDLER; 1181 return (MDMNE_NO_HANDLER); 1182 } 1183 1184 class = mdmn_get_message_class(msg->msg_type); 1185 setno = msg->msg_setno; 1186 1187 result->mmr_msgtype = msgtype; 1188 result->mmr_flags = msg->msg_flags; 1189 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 1190 1191 (void) mutex_lock(&mct_mutex[setno][class]); 1192 completed = mdmn_check_completion(msg, result); 1193 if (completed == MDMN_MCT_NOT_DONE) { 1194 /* message not yet processed locally */ 1195 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1196 "calling handler for (%d,0x%llx-%d) type %d\n", 1197 MSGID_ELEMS(msg->msg_msgid), msgtype); 1198 1199 /* 1200 * Mark the message as being currently processed, 1201 * so we won't start a second handler for it 1202 */ 1203 (void) mdmn_mark_completion(msg, NULL, MDMN_MCT_IN_PROGRESS); 1204 
(void) mutex_unlock(&mct_mutex[setno][class]); 1205 1206 /* here we actually process the message on the master */ 1207 (*handler)(msg, MD_MSGF_ON_MASTER, result); 1208 1209 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1210 "finished handler for (%d,0x%llx-%d) type %d\n", 1211 MSGID_ELEMS(msg->msg_msgid), msgtype); 1212 1213 /* Mark the message as fully processed, store the result */ 1214 (void) mutex_lock(&mct_mutex[setno][class]); 1215 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE); 1216 } else if (completed == MDMN_MCT_DONE) { 1217 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1218 "result for (%d, 0x%llx-%d) from MCT\n", 1219 MSGID_ELEMS(msg->msg_msgid), msgtype); 1220 } else if (completed == MDMN_MCT_IN_PROGRESS) { 1221 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1222 "(%d, 0x%llx-%d) is currently being processed\n", 1223 MSGID_ELEMS(msg->msg_msgid), msgtype); 1224 } else { 1225 /* MCT error occurred (should never happen) */ 1226 (void) mutex_unlock(&mct_mutex[setno][class]); 1227 result->mmr_comm_state = MDMNE_LOG_FAIL; 1228 commd_debug(MD_MMV_SYSLOG, "WARNING " 1229 "mdmn_check_completion returned %d " 1230 "for (%d,0x%llx-%d)\n", completed, 1231 MSGID_ELEMS(msg->msg_msgid)); 1232 return (MDMNE_LOG_FAIL); 1233 } 1234 (void) mutex_unlock(&mct_mutex[setno][class]); 1235 return (MDMNE_ACK); 1236 1237 } 1238 1239 /* 1240 * do_send_message(msg, node) 1241 * 1242 * Send a message to a given node and wait for a acknowledgment, that the 1243 * message has arrived on the remote node. 1244 * Make sure that the client for the set is setup correctly. 1245 * If no ACK arrives, destroy and recreate the RPC client and retry the 1246 * message one time 1247 * After actually sending wait no longer than the appropriate number of 1248 * before timing out the message. 
1249 * 1250 * Note must be called with set_desc_wrlock held in reader mode 1251 */ 1252 static int 1253 do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node) 1254 { 1255 int err; 1256 int rpc_retries; 1257 int timeout_retries = 0; 1258 int *ret = NULL; 1259 set_t setno; 1260 cond_t *cv; /* see mdmn_wakeup_master_svc_2 */ 1261 mutex_t *mx; /* protection for class_busy */ 1262 timestruc_t timeout; /* surveillance for remote daemon */ 1263 md_mn_nodeid_t nid; 1264 md_mn_msgtype_t msgtype; 1265 md_mn_msgclass_t class; 1266 1267 nid = node->nd_nodeid; 1268 msgtype = msg->msg_type; 1269 setno = msg->msg_setno; 1270 class = mdmn_get_message_class(msgtype); 1271 mx = mdmn_get_master_table_mx(setno, class); 1272 cv = mdmn_get_master_table_cv(setno, class); 1273 1274 retry_rpc: 1275 1276 /* We try two times to send the message */ 1277 rpc_retries = 2; 1278 1279 /* 1280 * if sending the message doesn't succeed the first time due to a 1281 * RPC problem, we retry one time 1282 */ 1283 while ((rpc_retries != 0) && (ret == NULL)) { 1284 /* in abort state, we error out immediately */ 1285 if (md_commd_global_state & MD_CGS_ABORTED) { 1286 return (MDMNE_ABORT); 1287 } 1288 1289 (void) rw_rdlock(&client_rwlock[setno]); 1290 /* unable to create client? Ignore it */ 1291 if (check_client(setno, nid)) { 1292 /* 1293 * In case we cannot establish an RPC client, we 1294 * take this node out of our considerations. 1295 * This will be reset by a reconfig 1296 * cycle that should come pretty soon. 1297 * MNISSUE: Should a reconfig cycle 1298 * be forced on SunCluster? 
1299 */ 1300 node->nd_flags &= ~MD_MN_NODE_OWN; 1301 commd_debug(MD_MMV_SYSLOG, 1302 "WARNING couldn't create client for %s\n" 1303 "Reconfig cycle required\n", 1304 node->nd_nodename); 1305 commd_debug(MD_MMV_PROC_M, "proc_mas: (%d,0x%llx-%d) " 1306 "WARNING couldn't create client for %s\n", 1307 MSGID_ELEMS(msg->msg_msgid), node->nd_nodename); 1308 (void) rw_unlock(&client_rwlock[setno]); 1309 return (MDMNE_IGNORE_NODE); 1310 } 1311 /* let's be paranoid and check again before sending */ 1312 if (client[setno][nid] == NULL) { 1313 /* 1314 * if this is true, strange enough, we catch our breath, 1315 * and then continue, so that the client is set up 1316 * once again. 1317 */ 1318 commd_debug(MD_MMV_PROC_M, "client is NULL\n"); 1319 (void) rw_unlock(&client_rwlock[setno]); 1320 (void) sleep(1); 1321 continue; 1322 } 1323 1324 /* send it over, it will return immediately */ 1325 ret = mdmn_work_2(msg, client[setno][nid], nid); 1326 1327 (void) rw_unlock(&client_rwlock[setno]); 1328 1329 if (ret != NULL) { 1330 commd_debug(MD_MMV_PROC_M, 1331 "proc_mas: sending (%d,0x%llx-%d) to %d returned " 1332 " 0x%x\n", 1333 MSGID_ELEMS(msg->msg_msgid), nid, *ret); 1334 } else { 1335 commd_debug(MD_MMV_PROC_M, 1336 "proc_mas: sending (%d,0x%llx-%d) to %d returned " 1337 " NULL \n", 1338 MSGID_ELEMS(msg->msg_msgid), nid); 1339 } 1340 1341 if ((ret == NULL) || (*ret == MDMNE_CANNOT_CONNECT) || 1342 (*ret == MDMNE_THR_CREATE_FAIL)) { 1343 /* 1344 * Something happened to the daemon on the other side. 1345 * Kill the client, and try again. 1346 * check_client() will create a new client 1347 */ 1348 (void) rw_wrlock(&client_rwlock[setno]); 1349 mdmn_clnt_destroy(client[setno][nid]); 1350 if (client[setno][nid] != (CLIENT *)NULL) { 1351 client[setno][nid] = (CLIENT *)NULL; 1352 } 1353 (void) rw_unlock(&client_rwlock[setno]); 1354 1355 /* ... but don't try infinitely */ 1356 --rpc_retries; 1357 continue; 1358 } 1359 /* 1360 * If the class is locked on the other node, keep trying. 
1361 * This situation will go away automatically, 1362 * if we wait long enough 1363 */ 1364 if (*ret == MDMNE_CLASS_LOCKED) { 1365 (void) sleep(1); 1366 free(ret); 1367 ret = NULL; 1368 continue; 1369 } 1370 } 1371 if (ret == NULL) { 1372 return (MDMNE_RPC_FAIL); 1373 } 1374 1375 1376 /* if the slave is in abort state, we just ignore it. */ 1377 if (*ret == MDMNE_ABORT) { 1378 commd_debug(MD_MMV_PROC_M, 1379 "proc_mas: work(%d,0x%llx-%d) returned " 1380 "MDMNE_ABORT\n", 1381 MSGID_ELEMS(msg->msg_msgid)); 1382 free(ret); 1383 return (MDMNE_IGNORE_NODE); 1384 } 1385 1386 /* Did the remote processing succeed? */ 1387 if (*ret != MDMNE_ACK) { 1388 /* 1389 * Some commd failure in the middle of sending the msg 1390 * to the nodes. We don't continue here. 1391 */ 1392 commd_debug(MD_MMV_PROC_M, 1393 "proc_mas: work(%d,0x%llx-%d) returns %d\n", 1394 MSGID_ELEMS(msg->msg_msgid), *ret); 1395 free(ret); 1396 return (MDMNE_RPC_FAIL); 1397 } 1398 free(ret); 1399 ret = NULL; 1400 1401 /* 1402 * When we are here, we have sent the message to the other node and 1403 * we know that node has accepted it. 1404 * We go to sleep and have trust to be woken up by wakeup. 1405 * If we wakeup due to a timeout, or a signal, no result has been 1406 * placed in the appropriate slot. 1407 * If we timeout, it is likely that this is because the node has 1408 * gone away, so we will destroy the client and try it again in the 1409 * expectation that the rpc will fail and we will return 1410 * MDMNE_IGNORE_NODE. If that is not the case, the message must still 1411 * be being processed on the slave. In this case just timeout for 4 1412 * more seconds and then return RPC_FAIL if the message is not complete. 1413 */ 1414 timeout.tv_nsec = 0; 1415 timeout.tv_sec = (timeout_retries == 0) ? 
mdmn_get_timeout(msgtype) : 1416 FOUR_SECS.tv_sec; 1417 err = cond_reltimedwait(cv, mx, &timeout); 1418 1419 if (err == 0) { 1420 /* everything's fine, return success */ 1421 return (MDMNE_ACK); 1422 } 1423 1424 if (err == ETIME) { 1425 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1426 "timeout occured, set=%d, class=%d, " 1427 "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n", 1428 setno, class, MSGID_ELEMS(msg->msg_msgid), timeout_retries); 1429 if (timeout_retries == 0) { 1430 timeout_retries++; 1431 /* 1432 * Destroy the client and try the rpc call again 1433 */ 1434 (void) rw_wrlock(&client_rwlock[setno]); 1435 mdmn_clnt_destroy(client[setno][nid]); 1436 client[setno][nid] = (CLIENT *)NULL; 1437 (void) rw_unlock(&client_rwlock[setno]); 1438 goto retry_rpc; 1439 } 1440 } else if (err == EINTR) { 1441 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1442 "commd signalled, set=%d, class=%d, " 1443 "msgid=(%d, 0x%llx-%d)\n", 1444 setno, class, MSGID_ELEMS(msg->msg_msgid)); 1445 } else { 1446 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1447 "cond_reltimedwait err=%d, set=%d, " 1448 "class=%d, msgid=(%d, 0x%llx-%d)\n", 1449 err, setno, class, 1450 MSGID_ELEMS(msg->msg_msgid)); 1451 } 1452 1453 /* some failure happened */ 1454 return (MDMNE_RPC_FAIL); 1455 } 1456 1457 /* 1458 * before we return we have to 1459 * free_msg(msg); because we are working on a copied message 1460 */ 1461 void 1462 mdmn_master_process_msg(md_mn_msg_t *msg) 1463 { 1464 int *ret; 1465 int err; 1466 int nmsgs; /* total number of msgs */ 1467 int curmsg; /* index of current msg */ 1468 set_t setno; 1469 uint_t inherit_flags = 0; 1470 uint_t secdiff, usecdiff; /* runtime of this message */ 1471 md_error_t mde = mdnullerror; 1472 md_mn_msg_t *msglist[MAX_SUBMESSAGES]; /* all msgs to process */ 1473 md_mn_msg_t *cmsg; /* current msg */ 1474 md_mn_msgid_t dummyid; 1475 md_mn_result_t *result; 1476 md_mn_result_t *slave_result; 1477 md_mn_nodeid_t sender; 1478 md_mn_nodeid_t set_master; 1479 md_mnnode_desc *node; 1480 
md_mn_msgtype_t orig_type; /* type of the original message */ 1481 md_mn_msgtype_t msgtype; /* type of the current message */ 1482 md_mn_msgclass_t orig_class; /* class of the original message */ 1483 md_mn_msgclass_t class; /* class of the current message */ 1484 1485 int (*smgen)(md_mn_msg_t *msg, md_mn_msg_t **msglist); 1486 1487 orig_type = msgtype = msg->msg_type; 1488 sender = msg->msg_sender; 1489 setno = msg->msg_setno; 1490 1491 result = Zalloc(sizeof (md_mn_result_t)); 1492 result->mmr_setno = setno; 1493 result->mmr_msgtype = msgtype; 1494 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 1495 1496 orig_class = mdmn_get_message_class(msgtype); 1497 1498 commd_debug(MD_MMV_PROC_M, 1499 "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 1500 MSGID_ELEMS(msg->msg_msgid), setno, orig_class, msgtype); 1501 1502 (void) rw_rdlock(&set_desc_rwlock[setno]); 1503 set_master = set_descriptor[setno]->sd_mn_master_nodeid; 1504 result->mmr_sender = set_master; 1505 /* 1506 * Put message into the change log unless told otherwise 1507 * Note that we only log original messages. 1508 * If they are generated by some smgen, we don't log them! 1509 * Replay messages aren't logged either. 1510 * Note, that replay messages are unlogged on completion. 
1511 */ 1512 if ((msg->msg_flags & (MD_MSGF_NO_LOG | MD_MSGF_REPLAY_MSG)) == 0) { 1513 commd_debug(MD_MMV_PROC_M, 1514 "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n", 1515 MSGID_ELEMS(msg->msg_msgid), msgtype); 1516 err = mdmn_log_msg(msg); 1517 if (err == MDMNE_NULL) { 1518 /* msg logged successfully */ 1519 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1520 "done log_msg for (%d,0x%llx-%d) type %d\n", 1521 MSGID_ELEMS(msg->msg_msgid), msgtype); 1522 goto proceed; 1523 } 1524 if (err == MDMNE_ACK) { 1525 /* Same msg in the slot, proceed */ 1526 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1527 "already logged (%d,0x%llx-%d) type %d\n", 1528 MSGID_ELEMS(msg->msg_msgid), msgtype); 1529 goto proceed; 1530 } 1531 if (err == MDMNE_LOG_FAIL) { 1532 /* Oh, bad, the log is non functional. */ 1533 result->mmr_comm_state = MDMNE_LOG_FAIL; 1534 /* 1535 * Note that the mark_busy was already done by 1536 * mdmn_work_svc_2() 1537 */ 1538 (void) mutex_lock(&mdmn_busy_mutex[setno]); 1539 mdmn_mark_class_unbusy(setno, orig_class); 1540 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 1541 1542 } 1543 if (err == MDMNE_CLASS_BUSY) { 1544 /* 1545 * The log is occupied with a different message 1546 * that needs to be played first. 
1547 * We reject the current message with MDMNE_CLASS_BUSY 1548 * to the initiator and do not unbusy the set/class, 1549 * because we will proceed with the logged message, 1550 * which has the same set/class combination 1551 */ 1552 result->mmr_comm_state = MDMNE_CLASS_BUSY; 1553 } 1554 ret = (int *)NULL; 1555 (void) rw_rdlock(&client_rwlock[setno]); 1556 1557 if (check_client(setno, sender)) { 1558 commd_debug(MD_MMV_SYSLOG, 1559 "proc_mas: No client for initiator \n"); 1560 } else { 1561 ret = mdmn_wakeup_initiator_2(result, 1562 client[setno][sender], sender); 1563 } 1564 (void) rw_unlock(&client_rwlock[setno]); 1565 1566 if (ret == (int *)NULL) { 1567 commd_debug(MD_MMV_SYSLOG, 1568 "proc_mas: couldn't wakeup_initiator \n"); 1569 } else { 1570 if (*ret != MDMNE_ACK) { 1571 commd_debug(MD_MMV_SYSLOG, "proc_mas: " 1572 "wakeup_initiator returned %d\n", *ret); 1573 } 1574 free(ret); 1575 } 1576 free_msg(msg); 1577 1578 if (err == MDMNE_LOG_FAIL) { 1579 /* we can't proceed here */ 1580 free_result(result); 1581 (void) rw_unlock(&set_desc_rwlock[setno]); 1582 return; 1583 } else if (err == MDMNE_CLASS_BUSY) { 1584 mdmn_changelog_record_t *lr; 1585 lr = mdmn_get_changelogrec(setno, orig_class); 1586 assert(lr != NULL); 1587 1588 /* proceed with the logged message */ 1589 msg = copy_msg(&(lr->lr_msg), NULL); 1590 1591 /* 1592 * The logged message has to have the same class but 1593 * type and sender can be different 1594 */ 1595 orig_type = msgtype = msg->msg_type; 1596 sender = msg->msg_sender; 1597 1598 commd_debug(MD_MMV_PROC_M, 1599 "proc_mas: Got new message from change log: " 1600 "(%d,0x%llx-%d) type %d\n", 1601 MSGID_ELEMS(msg->msg_msgid), msgtype); 1602 1603 /* continue normal operation with this message */ 1604 } 1605 } 1606 1607 proceed: 1608 smgen = mdmn_get_submessage_generator(msgtype); 1609 if (smgen == NULL) { 1610 /* no submessages to create, just use the original message */ 1611 msglist[0] = msg; 1612 nmsgs = 1; 1613 } else { 1614 /* some bits are 
passed on to submessages */ 1615 inherit_flags = msg->msg_flags & MD_MSGF_INHERIT_BITS; 1616 1617 nmsgs = smgen(msg, msglist); 1618 1619 /* some settings for the submessages */ 1620 for (curmsg = 0; curmsg < nmsgs; curmsg++) { 1621 cmsg = msglist[curmsg]; 1622 1623 /* Apply the inherited flags */ 1624 cmsg->msg_flags |= inherit_flags; 1625 1626 /* 1627 * Make sure the submessage ID is set correctly 1628 * Note: first submessage has mid_smid of 1 (not 0) 1629 */ 1630 cmsg->msg_msgid.mid_smid = curmsg + 1; 1631 1632 /* need the original class set in msgID (for MCT) */ 1633 cmsg->msg_msgid.mid_oclass = orig_class; 1634 } 1635 1636 commd_debug(MD_MMV_PROC_M, 1637 "smgen generated %d submsgs, origclass = %d\n", 1638 nmsgs, orig_class); 1639 } 1640 /* 1641 * This big loop does the following. 1642 * For all messages: 1643 * process message on the master first (a message completion 1644 * table MCT ensures a message is not processed twice) 1645 * in case of an error break out of message loop 1646 * for all nodes -- unless MD_MSGF_NO_BCAST is set -- 1647 * send message to node until that succeeds 1648 * merge result -- not yet implemented 1649 * respect MD_MSGF_STOP_ON_ERROR 1650 */ 1651 for (curmsg = 0; curmsg < nmsgs; curmsg++) { 1652 int break_msg_loop = 0; 1653 mutex_t *mx; /* protection for class_busy */ 1654 int master_err; 1655 int master_exitval = -1; 1656 1657 cmsg = msglist[curmsg]; 1658 msgtype = cmsg->msg_type; 1659 class = mdmn_get_message_class(msgtype); 1660 node = NULL; 1661 mx = mdmn_get_master_table_mx(setno, class); 1662 1663 /* If we are in the abort state, we error out immediately */ 1664 if (md_commd_global_state & MD_CGS_ABORTED) { 1665 break; /* out of the message loop */ 1666 } 1667 1668 commd_debug(MD_MMV_PROC_M, "class=%d, orig_class=%d\n", 1669 class, orig_class); 1670 /* 1671 * If the current class is different from the original class, 1672 * we have to lock it down. 1673 * The original class is already marked busy. 
1674 * At this point we cannot refuse the message because the 1675 * class is busy right now, so we wait until the class becomes 1676 * available again. As soon as something changes for this set 1677 * we will be cond_signal'ed (in mdmn_mark_class_unbusy) 1678 * 1679 * Granularity could be finer (setno/class) 1680 */ 1681 if (class != orig_class) { 1682 (void) mutex_lock(&mdmn_busy_mutex[setno]); 1683 while (mdmn_mark_class_busy(setno, class) == FALSE) { 1684 (void) cond_wait(&mdmn_busy_cv[setno], 1685 &mdmn_busy_mutex[setno]); 1686 } 1687 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 1688 } 1689 1690 master_err = do_message_locally(cmsg, result); 1691 1692 if ((master_err != MDMNE_ACK) || 1693 ((master_err == MDMNE_ACK) && (result->mmr_exitval != 0))) { 1694 result->mmr_failing_node = set_master; 1695 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 1696 /* 1697 * if appropriate, unbusy the class and 1698 * break out of the message loop 1699 */ 1700 if (class != orig_class) { 1701 (void) mutex_lock( 1702 &mdmn_busy_mutex[setno]); 1703 mdmn_mark_class_unbusy(setno, class); 1704 (void) mutex_unlock( 1705 &mdmn_busy_mutex[setno]); 1706 } 1707 break; 1708 } 1709 } 1710 1711 if (master_err == MDMNE_ACK) 1712 master_exitval = result->mmr_exitval; 1713 1714 /* No broadcast? => next message */ 1715 if (cmsg->msg_flags & MD_MSGF_NO_BCAST) { 1716 /* if appropriate, unbusy the class */ 1717 if (class != orig_class) { 1718 (void) mutex_lock(&mdmn_busy_mutex[setno]); 1719 mdmn_mark_class_unbusy(setno, class); 1720 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 1721 } 1722 continue; 1723 } 1724 1725 1726 /* fake sender, so we get notified when the results are avail */ 1727 cmsg->msg_sender = set_master; 1728 /* 1729 * register to the master_table. It's needed by wakeup_master to 1730 * wakeup the sleeping thread. 
1731 * Access is protected by the class lock: mdmn_mark_class_busy() 1732 */ 1733 mdmn_set_master_table_id(setno, class, &(cmsg->msg_msgid)); 1734 1735 1736 1737 (void) rw_rdlock(&set_desc_rwlock[setno]); 1738 /* Send the message to all other nodes */ 1739 for (node = set_descriptor[setno]->sd_nodelist; node; 1740 node = node->nd_next) { 1741 md_mn_nodeid_t nid = node->nd_nodeid; 1742 1743 /* We are master and have already processed the msg */ 1744 if (node == set_descriptor[setno]->sd_mn_masternode) { 1745 continue; 1746 } 1747 1748 /* If this node didn't join the disk set, ignore it */ 1749 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 1750 continue; 1751 } 1752 1753 /* If a DIRECTED message, skip non-recipient nodes */ 1754 if ((cmsg->msg_flags & MD_MSGF_DIRECTED) && 1755 nid != cmsg->msg_recipient) { 1756 continue; 1757 } 1758 1759 (void) mutex_lock(mx); 1760 /* 1761 * Register the node that is addressed, 1762 * so we can detect unsolicited messages 1763 */ 1764 mdmn_set_master_table_addr(setno, class, nid); 1765 slave_result = (md_mn_result_t *)NULL; 1766 1767 /* 1768 * Now send it. do_send_message() will return if 1769 * a failure occurs or 1770 * the results are available 1771 */ 1772 err = do_send_message(cmsg, node); 1773 1774 /* in abort state, we error out immediately */ 1775 if (md_commd_global_state & MD_CGS_ABORTED) { 1776 break; 1777 } 1778 1779 if (err == MDMNE_ACK) { 1780 slave_result = 1781 mdmn_get_master_table_res(setno, class); 1782 commd_debug(MD_MMV_PROC_M, 1783 "proc_mas: got result for (%d,0x%llx-%d)\n", 1784 MSGID_ELEMS(cmsg->msg_msgid)); 1785 } else if (err == MDMNE_IGNORE_NODE) { 1786 (void) mutex_unlock(mx); 1787 continue; /* send to next node */ 1788 } 1789 (void) mutex_unlock(mx); 1790 1791 1792 /* 1793 * If the result is NULL, or err doesn't show success, 1794 * something went wrong with this RPC call. 
1795 */ 1796 if ((slave_result == NULL) || (err != MDMNE_ACK)) { 1797 /* 1798 * If PANIC_WHEN_INCONSISTENT set, 1799 * panic if the master succeeded while 1800 * this node failed 1801 */ 1802 if ((cmsg->msg_flags & 1803 MD_MSGF_PANIC_WHEN_INCONSISTENT) && 1804 (master_err == MDMNE_ACK)) 1805 panic_system(nid, cmsg->msg_type, 1806 master_err, master_exitval, 1807 slave_result); 1808 1809 result->mmr_failing_node = nid; 1810 /* are we supposed to stop in case of error? */ 1811 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 1812 result->mmr_exitval = MDMNE_RPC_FAIL; 1813 commd_debug(MD_MMV_SYSLOG, "proc_mas: " 1814 "result (%d,0x%llx-%d) is NULL\n", 1815 MSGID_ELEMS(cmsg->msg_msgid)); 1816 FLUSH_DEBUGFILE(); 1817 break_msg_loop = 1; 1818 break; /* out of node loop first */ 1819 } else { 1820 /* send msg to the next node */ 1821 continue; 1822 } 1823 1824 } 1825 1826 /* 1827 * Message processed on remote node. 1828 * If PANIC_WHEN_INCONSISTENT set, panic if the 1829 * result is different on this node from the result 1830 * on the master 1831 */ 1832 if ((cmsg->msg_flags & 1833 MD_MSGF_PANIC_WHEN_INCONSISTENT) && 1834 ((master_err != MDMNE_ACK) || 1835 (slave_result->mmr_exitval != master_exitval))) 1836 panic_system(nid, cmsg->msg_type, master_err, 1837 master_exitval, slave_result); 1838 1839 /* 1840 * At this point we know we have a message that was 1841 * processed on the remote node. 1842 * We now check if the exitval is non zero. 1843 * In that case we discard the previous result and 1844 * rather use the current. 1845 * This means: If a message fails on no node, 1846 * the result from the master will be returned. 
1847 * There's currently no such thing as merge of results 1848 * If additionally STOP_ON_ERROR is set, we bail out 1849 */ 1850 if (slave_result->mmr_exitval != 0) { 1851 /* throw away the previously allocated result */ 1852 free_result(result); 1853 1854 /* copy_result() allocates new memory */ 1855 result = copy_result(slave_result); 1856 free_result(slave_result); 1857 1858 dump_result(MD_MMV_PROC_M, "proc_mas", result); 1859 1860 result->mmr_failing_node = nid; 1861 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 1862 break_msg_loop = 1; 1863 break; /* out of node loop */ 1864 } 1865 continue; /* try next node */ 1866 1867 } else { 1868 /* 1869 * MNIssue: may want to merge the results 1870 * from all slaves. Currently only report 1871 * the results from the master. 1872 */ 1873 free_result(slave_result); 1874 } 1875 1876 } /* End of loop over the nodes */ 1877 (void) rw_unlock(&set_desc_rwlock[setno]); 1878 1879 1880 /* release the current class again */ 1881 if (class != orig_class) { 1882 (void) mutex_lock(&mdmn_busy_mutex[setno]); 1883 mdmn_mark_class_unbusy(setno, class); 1884 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 1885 } 1886 1887 /* are we supposed to quit entirely ? */ 1888 if (break_msg_loop || 1889 (md_commd_global_state & MD_CGS_ABORTED)) { 1890 break; /* out of msg loop */ 1891 } 1892 1893 } /* End of loop over the messages */ 1894 /* 1895 * If we are here, there's two possibilities: 1896 * - we processed all messages on all nodes without an error. 1897 * In this case we return the result from the master. 1898 * (to be implemented: return the merged result) 1899 * - we encountered an error in which case result has been 1900 * set accordingly already. 1901 */ 1902 1903 if (md_commd_global_state & MD_CGS_ABORTED) { 1904 result->mmr_comm_state = MDMNE_ABORT; 1905 } 1906 1907 /* 1908 * This message has been processed completely. 1909 * Remove it from the changelog. 1910 * Do this for replay messages too. 
1911 * Note that the message is unlogged before waking up the 1912 * initiator. This is done for two reasons. 1913 * 1. Remove a race condition that occurs when back to back 1914 * messages are sent for the same class, the registeration is 1915 * is lost. 1916 * 2. If the initiator died but the action was completed on all the 1917 * the nodes, we want that to be marked "done" quickly. 1918 */ 1919 1920 if ((msg->msg_flags & MD_MSGF_NO_LOG) == 0) { 1921 commd_debug(MD_MMV_PROC_M, 1922 "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n", 1923 MSGID_ELEMS(msg->msg_msgid), msgtype); 1924 (void) mdmn_unlog_msg(msg); 1925 commd_debug(MD_MMV_PROC_M, 1926 "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n", 1927 MSGID_ELEMS(msg->msg_msgid), msgtype); 1928 } 1929 1930 /* 1931 * In case of submessages, we increased the submessage ID in the 1932 * result structure. We restore the message ID to the value that 1933 * the initiator is waiting for. 1934 */ 1935 result->mmr_msgid.mid_smid = 0; 1936 result->mmr_msgtype = orig_type; 1937 result->mmr_sender = set_master; 1938 1939 /* if we have an inited client, send result */ 1940 ret = (int *)NULL; 1941 1942 (void) rw_rdlock(&client_rwlock[setno]); 1943 if (check_client(setno, sender)) { 1944 commd_debug(MD_MMV_SYSLOG, 1945 "proc_mas: unable to create client for initiator\n"); 1946 } else { 1947 ret = mdmn_wakeup_initiator_2(result, client[setno][sender], 1948 sender); 1949 } 1950 (void) rw_unlock(&client_rwlock[setno]); 1951 1952 if (ret == (int *)NULL) { 1953 commd_debug(MD_MMV_PROC_M, 1954 "proc_mas: couldn't wakeup initiator\n"); 1955 } else { 1956 if (*ret != MDMNE_ACK) { 1957 commd_debug(MD_MMV_PROC_M, 1958 "proc_mas: wakeup_initiator returned %d\n", 1959 *ret); 1960 } 1961 free(ret); 1962 } 1963 1964 (void) rw_unlock(&set_desc_rwlock[setno]); 1965 /* Free all submessages, if there were any */ 1966 if (nmsgs > 1) { 1967 for (curmsg = 0; curmsg < nmsgs; curmsg++) { 1968 free_msg(msglist[curmsg]); 1969 } 1970 } 1971 
/* Free the result */ 1972 free_result(result); 1973 1974 (void) mutex_lock(&mdmn_busy_mutex[setno]); 1975 mdmn_mark_class_unbusy(setno, orig_class); 1976 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 1977 1978 1979 /* 1980 * We use this ioctl just to get the time in the same format as used in 1981 * the messageID. If it fails, all we get is a bad runtime output. 1982 */ 1983 (void) metaioctl(MD_IOCGUNIQMSGID, &dummyid, &mde, NULL); 1984 secdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) >> 32; 1985 usecdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) & 0xfffff; 1986 1987 /* catching possible overflow */ 1988 if (usecdiff >= 1000000) { 1989 usecdiff -= 1000000; 1990 secdiff++; 1991 } 1992 1993 1994 commd_debug(MD_MMV_PROC_M, "proc_mas: done (%d, 0x%llx-%d) type=%02d " 1995 "%5d.%06d secs runtime\n", 1996 MSGID_ELEMS(msg->msg_msgid), orig_type, secdiff, usecdiff); 1997 1998 /* Free the original message */ 1999 free_msg(msg); 2000 } 2001 2002 void 2003 mdmn_slave_process_msg(md_mn_msg_t *msg) 2004 { 2005 int *ret = NULL; 2006 int completed; 2007 int retries; 2008 int successfully_returned; 2009 set_t setno; 2010 md_mn_result_t *result; 2011 md_mn_nodeid_t sender; 2012 md_mn_nodeid_t whoami; 2013 md_mn_msgtype_t msgtype; 2014 md_mn_msgclass_t class; 2015 2016 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res); 2017 2018 setno = msg->msg_setno; 2019 sender = msg->msg_sender; /* this is always the master of the set */ 2020 msgtype = msg->msg_type; 2021 2022 (void) rw_rdlock(&set_desc_rwlock[setno]); 2023 whoami = set_descriptor[setno]->sd_mn_mynode->nd_nodeid; 2024 (void) rw_unlock(&set_desc_rwlock[setno]); 2025 2026 result = Zalloc(sizeof (md_mn_result_t)); 2027 result->mmr_flags = msg->msg_flags; 2028 result->mmr_setno = setno; 2029 result->mmr_msgtype = msgtype; 2030 result->mmr_sender = whoami; 2031 result->mmr_comm_state = MDMNE_ACK; /* Ok state */ 2032 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 2033 class = 
mdmn_get_message_class(msgtype); 2034 2035 commd_debug(MD_MMV_PROC_S, 2036 "proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 2037 MSGID_ELEMS(msg->msg_msgid), setno, class, msgtype); 2038 2039 handler = mdmn_get_handler(msgtype); 2040 2041 if (handler == NULL) { 2042 result->mmr_exitval = 0; 2043 /* let the sender decide if this is an error or not */ 2044 result->mmr_comm_state = MDMNE_NO_HANDLER; 2045 commd_debug(MD_MMV_PROC_S, 2046 "proc_sla: No handler for (%d, 0x%llx-%d)\n", 2047 MSGID_ELEMS(msg->msg_msgid)); 2048 } else { 2049 2050 /* Did we already process this message ? */ 2051 (void) mutex_lock(&mct_mutex[setno][class]); 2052 completed = mdmn_check_completion(msg, result); 2053 2054 if (completed == MDMN_MCT_NOT_DONE) { 2055 /* message not yet processed locally */ 2056 commd_debug(MD_MMV_PROC_S, 2057 "proc_sla: calling handler for (%d, 0x%llx-%d)\n", 2058 MSGID_ELEMS(msg->msg_msgid)); 2059 2060 /* 2061 * Mark the message as being currently processed, 2062 * so we won't start a second handler for it 2063 */ 2064 (void) mdmn_mark_completion(msg, NULL, 2065 MDMN_MCT_IN_PROGRESS); 2066 2067 (void) mutex_unlock(&mct_mutex[setno][class]); 2068 (*handler)(msg, MD_MSGF_ON_SLAVE, result); 2069 2070 commd_debug(MD_MMV_PROC_S, 2071 "proc_sla: finished handler for (%d, 0x%llx-%d)\n", 2072 MSGID_ELEMS(msg->msg_msgid)); 2073 2074 (void) mutex_lock(&mct_mutex[setno][class]); 2075 /* Mark the message as fully done, store the result */ 2076 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE); 2077 2078 } else if (completed == MDMN_MCT_DONE) { 2079 /* message processed previously, got result from MCT */ 2080 commd_debug(MD_MMV_PROC_S, 2081 "proc_sla: result for (%d, 0x%llx-%d) from MCT\n", 2082 MSGID_ELEMS(msg->msg_msgid)); 2083 } else if (completed == MDMN_MCT_IN_PROGRESS) { 2084 /* 2085 * If the message is curruntly being processed, 2086 * we can return here, without sending a result back. 
2087 * This will be done by the initial message handling 2088 * thread 2089 */ 2090 (void) mutex_unlock(&mct_mutex[setno][class]); 2091 commd_debug(MD_MMV_PROC_M, "proc_sla: " 2092 "(%d, 0x%llx-%d) is currently being processed\n", 2093 MSGID_ELEMS(msg->msg_msgid), msgtype); 2094 2095 free_msg(msg); 2096 free_result(result); 2097 return; 2098 } else { 2099 /* MCT error occurred (should never happen) */ 2100 result->mmr_comm_state = MDMNE_LOG_FAIL; 2101 commd_debug(MD_MMV_PROC_S, 2102 "proc_sla: MCT error for (%d, 0x%llx-%d)\n", 2103 MSGID_ELEMS(msg->msg_msgid)); 2104 } 2105 (void) mutex_unlock(&mct_mutex[setno][class]); 2106 } 2107 2108 /* 2109 * At this point we have a result (even in an error case) 2110 * that we return to the master. 2111 */ 2112 (void) rw_rdlock(&set_desc_rwlock[setno]); 2113 retries = 2; /* we will try two times to send the results */ 2114 successfully_returned = 0; 2115 2116 while (!successfully_returned && (retries != 0)) { 2117 ret = (int *)NULL; 2118 (void) rw_rdlock(&client_rwlock[setno]); 2119 if (check_client(setno, sender)) { 2120 /* 2121 * If we cannot setup the rpc connection to the master, 2122 * we can't do anything besides logging this fact. 2123 */ 2124 commd_debug(MD_MMV_SYSLOG, 2125 "proc_mas: unable to create client for master\n"); 2126 (void) rw_unlock(&client_rwlock[setno]); 2127 break; 2128 } else { 2129 ret = mdmn_wakeup_master_2(result, 2130 client[setno][sender], sender); 2131 /* 2132 * if mdmn_wakeup_master_2 returns NULL, it can be that 2133 * the master (or the commd on the master) had died. 2134 * In that case, we destroy the client to the master 2135 * and retry. 
2136 * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK, 2137 * the commd on the master is alive but 2138 * something else is wrong, 2139 * in that case a retry doesn't make sense => break out 2140 */ 2141 if (ret == (int *)NULL) { 2142 commd_debug(MD_MMV_PROC_S, 2143 "proc_sla: wakeup_master returned NULL\n"); 2144 /* release reader lock, grab writer lock */ 2145 (void) rw_unlock(&client_rwlock[setno]); 2146 (void) rw_wrlock(&client_rwlock[setno]); 2147 mdmn_clnt_destroy(client[setno][sender]); 2148 if (client[setno][sender] != (CLIENT *)NULL) { 2149 client[setno][sender] = (CLIENT *)NULL; 2150 } 2151 (void) rw_unlock(&client_rwlock[setno]); 2152 retries--; 2153 commd_debug(MD_MMV_PROC_S, 2154 "retries = %d\n", retries); 2155 continue; 2156 } 2157 if (*ret != MDMNE_ACK) { 2158 commd_debug(MD_MMV_PROC_S, "proc_sla: " 2159 "wakeup_master returned %d\n", *ret); 2160 (void) rw_unlock(&client_rwlock[setno]); 2161 break; 2162 } else { /* Good case */ 2163 successfully_returned = 1; 2164 (void) rw_unlock(&client_rwlock[setno]); 2165 } 2166 } 2167 } 2168 2169 (void) rw_unlock(&set_desc_rwlock[setno]); 2170 commd_debug(MD_MMV_PROC_S, "proc_sla: done (%d, 0x%llx-%d)\n", 2171 MSGID_ELEMS(msg->msg_msgid)); 2172 2173 if (ret != (int *)NULL) 2174 free(ret); 2175 free_msg(msg); 2176 free_result(result); 2177 } 2178 2179 2180 /* 2181 * mdmn_send_svc_2: 2182 * --------------- 2183 * Check that the issuing node is a legitimate one (i.e. is licensed to send 2184 * messages to us), that the RPC request can be staged. 2185 * 2186 * Returns: 2187 * 0 => no RPC request is in-flight, no deferred svc_sendreply() 2188 * 1 => queued RPC request in-flight. 
Completion will be made (later) 2189 * by a wakeup_initiator_2() [hopefully] 2190 */ 2191 int 2192 mdmn_send_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp) 2193 { 2194 int err; 2195 set_t setno; 2196 SVCXPRT *transp = rqstp->rq_xprt; 2197 md_mn_msg_t *msg; 2198 md_mn_result_t *resultp; 2199 md_mn_msgclass_t class; 2200 md_mn_msg_and_transp_t *matp; 2201 2202 msg = copy_msg(omsg, NULL); 2203 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 2204 2205 setno = msg->msg_setno; 2206 class = mdmn_get_message_class(msg->msg_type); 2207 2208 /* If we are in the abort state, we error out immediately */ 2209 if (md_commd_global_state & MD_CGS_ABORTED) { 2210 resultp = Zalloc(sizeof (md_mn_result_t)); 2211 resultp->mmr_comm_state = MDMNE_ABORT; 2212 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2213 free_result(resultp); 2214 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2215 return (0); 2216 } 2217 2218 /* check if the global initialization is done */ 2219 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2220 global_init(); 2221 } 2222 2223 commd_debug(MD_MMV_SEND, 2224 "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n", 2225 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 2226 2227 /* Check for verbosity related message */ 2228 if (msg->msg_type == MD_MN_MSG_VERBOSITY) { 2229 md_mn_verbose_t *d; 2230 2231 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data)); 2232 md_commd_global_verb = d->mmv_what; 2233 /* everytime the bitmask is set, we reset the timer */ 2234 __savetime = gethrtime(); 2235 /* 2236 * If local-only-flag is set, we are done here, 2237 * otherwise we pass that message on to the master. 
2238 */ 2239 if (msg->msg_flags & MD_MSGF_LOCAL_ONLY) { 2240 resultp = Zalloc(sizeof (md_mn_result_t)); 2241 resultp->mmr_comm_state = MDMNE_ACK; 2242 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, 2243 (char *)resultp); 2244 free_result(resultp); 2245 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2246 return (0); 2247 } 2248 } 2249 2250 /* 2251 * Are we entering the abort state? 2252 * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because 2253 * this message cannot be distributed anyway. 2254 * So, it's safe to return immediately. 2255 */ 2256 if (msg->msg_type == MD_MN_MSG_ABORT) { 2257 md_commd_global_state |= MD_CGS_ABORTED; 2258 resultp = Zalloc(sizeof (md_mn_result_t)); 2259 resultp->mmr_comm_state = MDMNE_ACK; 2260 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2261 free_result(resultp); 2262 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2263 return (0); 2264 } 2265 2266 2267 /* 2268 * Is this message type blocked? 2269 * If so we return MDMNE_CLASS_LOCKED, immediately 2270 */ 2271 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) { 2272 resultp = Zalloc(sizeof (md_mn_result_t)); 2273 resultp->mmr_comm_state = MDMNE_CLASS_LOCKED; 2274 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2275 free_result(resultp); 2276 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2277 commd_debug(MD_MMV_SEND, 2278 "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, " 2279 "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class, 2280 msg->msg_type); 2281 return (0); 2282 } 2283 2284 2285 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2286 /* Can only use the appropriate mutexes if they are inited */ 2287 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2288 (void) rw_wrlock(&set_desc_rwlock[setno]); 2289 (void) rw_wrlock(&client_rwlock[setno]); 2290 err = mdmn_init_set(setno, MDMN_SET_READY); 2291 (void) rw_unlock(&client_rwlock[setno]); 2292 (void) rw_unlock(&set_desc_rwlock[setno]); 2293 } else { 2294 err = 
mdmn_init_set(setno, MDMN_SET_READY); 2295 } 2296 2297 if (err) { 2298 /* couldn't initialize connections, cannot proceed */ 2299 resultp = Zalloc(sizeof (md_mn_result_t)); 2300 resultp->mmr_comm_state = err; 2301 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, 2302 (char *)resultp); 2303 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2304 free_result(resultp); 2305 commd_debug(MD_MMV_SEND, 2306 "send: init err = %d\n", err); 2307 return (0); 2308 } 2309 } 2310 2311 (void) mutex_lock(&mdmn_busy_mutex[setno]); 2312 if ((mdmn_is_class_suspended(setno, class) == TRUE) && 2313 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) { 2314 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2315 resultp = Zalloc(sizeof (md_mn_result_t)); 2316 resultp->mmr_comm_state = MDMNE_SUSPENDED; 2317 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2318 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2319 free_result(resultp); 2320 commd_debug(MD_MMV_SEND, 2321 "send: class suspended (%d, 0x%llx-%d), set=%d, " 2322 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), 2323 setno, class, msg->msg_type); 2324 return (0); 2325 } 2326 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2327 2328 /* is this rpc request coming from the local node? */ 2329 if (check_license(rqstp, 0) == FALSE) { 2330 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2331 commd_debug(MD_MMV_SEND, 2332 "send: check licence fail(%d, 0x%llx-%d), set=%d, " 2333 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), 2334 setno, class, msg->msg_type); 2335 return (0); 2336 } 2337 2338 2339 /* 2340 * We allocate a structure that can take two pointers in order to pass 2341 * both the message and the transp into thread_create. 2342 * The free for this alloc is done in mdmn_send_to_work() 2343 */ 2344 matp = Malloc(sizeof (md_mn_msg_and_transp_t)); 2345 matp->mat_msg = msg; 2346 matp->mat_transp = transp; 2347 2348 /* 2349 * create a thread here that calls work on the master. 
2350 * If we are already on the master, this would block if running 2351 * in the same context. (our service is single threaded)( 2352 * Make it a detached thread because it will not communicate with 2353 * anybody thru thr_* mechanisms 2354 */ 2355 (void) thr_create(NULL, 0, mdmn_send_to_work, (void *) matp, 2356 THR_DETACHED, NULL); 2357 2358 commd_debug(MD_MMV_SEND, "send: done (%d, 0x%llx-%d)\n", 2359 MSGID_ELEMS(msg->msg_msgid)); 2360 /* 2361 * We return here without sending results. This will be done by 2362 * mdmn_wakeup_initiator_svc_2() as soon as the results are available. 2363 * Until then the calling send_message will be blocked, while we 2364 * are able to take calls. 2365 */ 2366 2367 return (1); 2368 } 2369 2370 /* ARGSUSED */ 2371 int * 2372 mdmn_work_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp) 2373 { 2374 int err; 2375 set_t setno; 2376 thread_t tid; 2377 int *retval; 2378 md_mn_msg_t *msg; 2379 md_mn_msgclass_t class; 2380 2381 retval = Malloc(sizeof (int)); 2382 2383 /* If we are in the abort state, we error out immediately */ 2384 if (md_commd_global_state & MD_CGS_ABORTED) { 2385 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 2386 *retval = MDMNE_ABORT; 2387 return (retval); 2388 } 2389 2390 msg = copy_msg(omsg, NULL); 2391 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 2392 2393 /* 2394 * Is this message type blocked? 2395 * If so we return MDMNE_CLASS_LOCKED, immediately. 2396 * This check is performed on master and slave. 
2397 */ 2398 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) { 2399 *retval = MDMNE_CLASS_LOCKED; 2400 return (retval); 2401 } 2402 2403 /* check if the global initialization is done */ 2404 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2405 global_init(); 2406 } 2407 2408 class = mdmn_get_message_class(msg->msg_type); 2409 setno = msg->msg_setno; 2410 2411 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2412 /* Can only use the appropriate mutexes if they are inited */ 2413 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2414 (void) rw_wrlock(&set_desc_rwlock[setno]); 2415 (void) rw_wrlock(&client_rwlock[setno]); 2416 err = mdmn_init_set(setno, MDMN_SET_READY); 2417 (void) rw_unlock(&client_rwlock[setno]); 2418 (void) rw_unlock(&set_desc_rwlock[setno]); 2419 } else { 2420 err = mdmn_init_set(setno, MDMN_SET_READY); 2421 } 2422 2423 if (err) { 2424 *retval = MDMNE_CANNOT_CONNECT; 2425 free_msg(msg); 2426 return (retval); 2427 } 2428 } 2429 2430 /* is this rpc request coming from a licensed node? 
*/ 2431 if (check_license(rqstp, msg->msg_sender) == FALSE) { 2432 free_msg(msg); 2433 *retval = MDMNE_RPC_FAIL; 2434 return (retval); 2435 } 2436 2437 commd_debug(MD_MMV_WORK, 2438 "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, " 2439 "flags=0x%x\n", 2440 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type, 2441 msg->msg_flags); 2442 2443 /* Check for various CLASS0 message types */ 2444 if (msg->msg_type == MD_MN_MSG_VERBOSITY) { 2445 md_mn_verbose_t *d; 2446 2447 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data)); 2448 /* for now we ignore set / class in md_mn_verbose_t */ 2449 md_commd_global_verb = d->mmv_what; 2450 /* everytime the bitmask is set, we reset the timer */ 2451 __savetime = gethrtime(); 2452 } 2453 2454 (void) mutex_lock(&mdmn_busy_mutex[setno]); 2455 2456 /* check if class is locked via a call to mdmn_comm_lock_svc_2 */ 2457 if (mdmn_is_class_locked(setno, class) == TRUE) { 2458 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2459 *retval = MDMNE_CLASS_LOCKED; 2460 free_msg(msg); 2461 return (retval); 2462 } 2463 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2464 2465 /* Check if the class is busy right now. Do it only on the master */ 2466 (void) rw_rdlock(&set_desc_rwlock[setno]); 2467 if (set_descriptor[setno]->sd_mn_am_i_master) { 2468 (void) rw_unlock(&set_desc_rwlock[setno]); 2469 /* 2470 * If the class is currently suspended, don't accept new 2471 * messages, unless they are flagged with an override bit. 
2472 */ 2473 (void) mutex_lock(&mdmn_busy_mutex[setno]); 2474 if ((mdmn_is_class_suspended(setno, class) == TRUE) && 2475 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) { 2476 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2477 *retval = MDMNE_SUSPENDED; 2478 commd_debug(MD_MMV_SEND, 2479 "send: set %d is suspended\n", setno); 2480 free_msg(msg); 2481 return (retval); 2482 } 2483 if (mdmn_mark_class_busy(setno, class) == FALSE) { 2484 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2485 *retval = MDMNE_CLASS_BUSY; 2486 free_msg(msg); 2487 return (retval); 2488 } 2489 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2490 /* 2491 * Because the real processing of the message takes time we 2492 * create a thread for it. So the master thread can continue 2493 * to run and accept further messages. 2494 */ 2495 *retval = thr_create(NULL, 0, 2496 (void *(*)(void *))mdmn_master_process_msg, (void *)msg, 2497 THR_DETACHED|THR_SUSPENDED, &tid); 2498 } else { 2499 (void) rw_unlock(&set_desc_rwlock[setno]); 2500 *retval = thr_create(NULL, 0, 2501 (void *(*)(void *)) mdmn_slave_process_msg, (void *)msg, 2502 THR_DETACHED|THR_SUSPENDED, &tid); 2503 } 2504 2505 if (*retval != 0) { 2506 *retval = MDMNE_THR_CREATE_FAIL; 2507 free_msg(msg); 2508 return (retval); 2509 } 2510 2511 /* Now run the new thread */ 2512 (void) thr_continue(tid); 2513 2514 commd_debug(MD_MMV_WORK, 2515 "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n", 2516 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 2517 2518 *retval = MDMNE_ACK; /* this means success */ 2519 return (retval); 2520 } 2521 2522 /* ARGSUSED */ 2523 int * 2524 mdmn_wakeup_initiator_svc_2(md_mn_result_t *res, struct svc_req *rqstp) 2525 { 2526 2527 int *retval; 2528 int err; 2529 set_t setno; 2530 mutex_t *mx; /* protection of initiator_table */ 2531 SVCXPRT *transp = NULL; 2532 md_mn_msgid_t initiator_table_id; 2533 md_mn_msgclass_t class; 2534 2535 retval = Malloc(sizeof (int)); 2536 2537 /* check if the global 
initialization is done */ 2538 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2539 global_init(); 2540 } 2541 2542 setno = res->mmr_setno; 2543 2544 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2545 /* set not ready means we just crashed are restarted now */ 2546 /* Can only use the appropriate mutexes if they are inited */ 2547 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2548 (void) rw_wrlock(&set_desc_rwlock[setno]); 2549 (void) rw_wrlock(&client_rwlock[setno]); 2550 err = mdmn_init_set(setno, MDMN_SET_READY); 2551 (void) rw_unlock(&client_rwlock[setno]); 2552 (void) rw_unlock(&set_desc_rwlock[setno]); 2553 } else { 2554 err = mdmn_init_set(setno, MDMN_SET_READY); 2555 } 2556 2557 if (err) { 2558 *retval = MDMNE_CANNOT_CONNECT; 2559 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2560 return (retval); 2561 } 2562 } 2563 2564 /* is this rpc request coming from a licensed node? */ 2565 if (check_license(rqstp, res->mmr_sender) == FALSE) { 2566 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2567 *retval = MDMNE_RPC_FAIL; 2568 return (retval); 2569 } 2570 2571 2572 class = mdmn_get_message_class(res->mmr_msgtype); 2573 mx = mdmn_get_initiator_table_mx(setno, class); 2574 2575 commd_debug(MD_MMV_WAKE_I, 2576 "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 2577 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype); 2578 2579 (void) mutex_lock(mx); 2580 2581 /* 2582 * Search the initiator wakeup table. 2583 * If we find an entry here (which should always be true) 2584 * we are on the initiating node and we wakeup the original 2585 * local rpc call. 
2586 */ 2587 mdmn_get_initiator_table_id(setno, class, &initiator_table_id); 2588 2589 if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) { 2590 transp = mdmn_get_initiator_table_transp(setno, class); 2591 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res); 2592 svc_done(transp); 2593 mdmn_unregister_initiator_table(setno, class); 2594 *retval = MDMNE_ACK; 2595 2596 commd_debug(MD_MMV_WAKE_I, 2597 "wake_ini: replied (%d, 0x%llx-%d)\n", 2598 MSGID_ELEMS(res->mmr_msgid)); 2599 } else { 2600 commd_debug(MD_MMV_WAKE_I, 2601 "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n", 2602 MSGID_ELEMS(res->mmr_msgid)); 2603 *retval = MDMNE_NO_WAKEUP_ENTRY; 2604 } 2605 (void) mutex_unlock(mx); 2606 /* less work for check_timeouts */ 2607 (void) mutex_lock(&check_timeout_mutex); 2608 if (messages_on_their_way == 0) { 2609 commd_debug(MD_MMV_WAKE_I, 2610 "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n", 2611 MSGID_ELEMS(res->mmr_msgid)); 2612 } else { 2613 messages_on_their_way--; 2614 } 2615 (void) mutex_unlock(&check_timeout_mutex); 2616 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2617 2618 return (retval); 2619 } 2620 2621 2622 /* 2623 * res must be free'd by the thread we wake up 2624 */ 2625 /* ARGSUSED */ 2626 int * 2627 mdmn_wakeup_master_svc_2(md_mn_result_t *ores, struct svc_req *rqstp) 2628 { 2629 2630 int *retval; 2631 int err; 2632 set_t setno; 2633 cond_t *cv; 2634 mutex_t *mx; 2635 md_mn_msgid_t master_table_id; 2636 md_mn_nodeid_t sender; 2637 md_mn_result_t *res; 2638 md_mn_msgclass_t class; 2639 2640 retval = Malloc(sizeof (int)); 2641 2642 /* check if the global initialization is done */ 2643 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2644 global_init(); 2645 } 2646 2647 /* Need to copy the results here, as they are static for RPC */ 2648 res = copy_result(ores); 2649 xdr_free(xdr_md_mn_result_t, (caddr_t)ores); 2650 2651 class = mdmn_get_message_class(res->mmr_msgtype); 2652 setno = res->mmr_setno; 2653 2654 if 
(md_mn_set_inited[setno] != MDMN_SET_READY) { 2655 /* set not ready means we just crashed are restarted now */ 2656 /* Can only use the appropriate mutexes if they are inited */ 2657 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2658 (void) rw_wrlock(&set_desc_rwlock[setno]); 2659 (void) rw_wrlock(&client_rwlock[setno]); 2660 err = mdmn_init_set(setno, MDMN_SET_READY); 2661 (void) rw_unlock(&client_rwlock[setno]); 2662 (void) rw_unlock(&set_desc_rwlock[setno]); 2663 } else { 2664 err = mdmn_init_set(setno, MDMN_SET_READY); 2665 } 2666 2667 if (err) { 2668 *retval = MDMNE_CANNOT_CONNECT; 2669 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2670 return (retval); 2671 } 2672 } 2673 2674 /* is this rpc request coming from a licensed node? */ 2675 if (check_license(rqstp, res->mmr_sender) == FALSE) { 2676 *retval = MDMNE_RPC_FAIL; 2677 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2678 return (retval); 2679 } 2680 2681 2682 commd_debug(MD_MMV_WAKE_M, 2683 "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d " 2684 "from %d\n", 2685 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype, 2686 res->mmr_sender); 2687 /* 2688 * The mutex and cv are needed for waking up the thread 2689 * sleeping in mdmn_master_process_msg() 2690 */ 2691 mx = mdmn_get_master_table_mx(setno, class); 2692 cv = mdmn_get_master_table_cv(setno, class); 2693 2694 /* 2695 * lookup the master wakeup table 2696 * If we find our message, we are on the master and 2697 * called by a slave that finished processing a message. 2698 * We store the results in the appropriate slot and 2699 * wakeup the thread (mdmn_master_process_msg()) waiting for them. 
2700 */ 2701 (void) mutex_lock(mx); 2702 mdmn_get_master_table_id(setno, class, &master_table_id); 2703 sender = mdmn_get_master_table_addr(setno, class); 2704 2705 if (MSGID_CMP(&(master_table_id), &(res->mmr_msgid))) { 2706 if (sender == res->mmr_sender) { 2707 mdmn_set_master_table_res(setno, class, res); 2708 (void) cond_signal(cv); 2709 *retval = MDMNE_ACK; 2710 } else { 2711 /* id is correct but wrong sender (I smell a timeout) */ 2712 commd_debug(MD_MMV_WAKE_M, 2713 "wakeup master got unsolicited message: " 2714 "(%d, 0x%llx-%d) from %d\n", 2715 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender); 2716 free_result(res); 2717 *retval = MDMNE_TIMEOUT; 2718 } 2719 } else { 2720 /* id is wrong, smells like a very late timeout */ 2721 commd_debug(MD_MMV_WAKE_M, 2722 "wakeup master got unsolicited message: " 2723 "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n", 2724 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender, 2725 MSGID_ELEMS(master_table_id)); 2726 free_result(res); 2727 *retval = MDMNE_NO_WAKEUP_ENTRY; 2728 } 2729 2730 (void) mutex_unlock(mx); 2731 2732 return (retval); 2733 } 2734 2735 /* 2736 * Lock a set/class combination. 2737 * This is mainly done for debug purpose. 2738 * This set/class combination immediately is blocked, 2739 * even in the middle of sending messages to multiple slaves. 2740 * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same 2741 * set/class combination. 2742 * 2743 * Special messages of class MD_MSG_CLASS0 can never be locked. 2744 * e.g. 
MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT 2745 * 2746 * That means, if MD_MSG_CLASS0 is specified, we lock all classes from 2747 * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES 2748 * 2749 * set must be between 1 and MD_MAXSETS 2750 * class can be: 2751 * MD_MSG_CLASS0 which means all other classes in this case 2752 * or one specific class (< MD_MN_NCLASSES) 2753 * 2754 * Returns: 2755 * MDMNE_ACK on sucess (locking a locked class is Ok) 2756 * MDMNE_EINVAL if a parameter is out of range 2757 */ 2758 2759 /* ARGSUSED */ 2760 int * 2761 mdmn_comm_lock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2762 { 2763 int *retval; 2764 set_t setno = msc->msc_set; 2765 md_mn_msgclass_t class = msc->msc_class; 2766 2767 retval = Malloc(sizeof (int)); 2768 2769 /* check if the global initialization is done */ 2770 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2771 global_init(); 2772 } 2773 2774 /* is this rpc request coming from the local node ? */ 2775 if (check_license(rqstp, 0) == FALSE) { 2776 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2777 *retval = MDMNE_RPC_FAIL; 2778 return (retval); 2779 } 2780 2781 /* Perform some range checking */ 2782 if ((setno == 0) || (setno >= MD_MAXSETS) || 2783 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) { 2784 *retval = MDMNE_EINVAL; 2785 return (retval); 2786 } 2787 2788 commd_debug(MD_MMV_MISC, "lock: set=%d, class=%d\n", setno, class); 2789 (void) mutex_lock(&mdmn_busy_mutex[setno]); 2790 if (class != MD_MSG_CLASS0) { 2791 mdmn_mark_class_locked(setno, class); 2792 } else { 2793 /* MD_MSG_CLASS0 is used as a wild card for all classes */ 2794 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 2795 mdmn_mark_class_locked(setno, class); 2796 } 2797 } 2798 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2799 2800 *retval = MDMNE_ACK; 2801 return (retval); 2802 } 2803 2804 /* 2805 * Unlock a set/class combination. 
2806 * set must be between 1 and MD_MAXSETS 2807 * class can be: 2808 * MD_MSG_CLASS0 which means all other classes in this case (like above) 2809 * or one specific class (< MD_MN_NCLASSES) 2810 * 2811 * Returns: 2812 * MDMNE_ACK on sucess (unlocking an unlocked class is Ok) 2813 * MDMNE_EINVAL if a parameter is out of range 2814 */ 2815 /* ARGSUSED */ 2816 int * 2817 mdmn_comm_unlock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2818 { 2819 int *retval; 2820 set_t setno = msc->msc_set; 2821 md_mn_msgclass_t class = msc->msc_class; 2822 2823 retval = Malloc(sizeof (int)); 2824 2825 /* check if the global initialization is done */ 2826 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2827 global_init(); 2828 } 2829 2830 /* is this rpc request coming from the local node ? */ 2831 if (check_license(rqstp, 0) == FALSE) { 2832 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2833 *retval = MDMNE_RPC_FAIL; 2834 return (retval); 2835 } 2836 2837 /* Perform some range checking */ 2838 if ((setno == 0) || (setno >= MD_MAXSETS) || 2839 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) { 2840 *retval = MDMNE_EINVAL; 2841 return (retval); 2842 } 2843 commd_debug(MD_MMV_MISC, "unlock: set=%d, class=%d\n", setno, class); 2844 2845 (void) mutex_lock(&mdmn_busy_mutex[setno]); 2846 if (class != MD_MSG_CLASS0) { 2847 mdmn_mark_class_unlocked(setno, class); 2848 } else { 2849 /* MD_MSG_CLASS0 is used as a wild card for all classes */ 2850 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 2851 mdmn_mark_class_unlocked(setno, class); 2852 } 2853 } 2854 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2855 2856 *retval = MDMNE_ACK; 2857 return (retval); 2858 } 2859 2860 /* 2861 * mdmn_comm_suspend_svc_2(setno, class) 2862 * 2863 * Drain all outstanding messages for a given set/class combination 2864 * and don't allow new messages to be processed. 2865 * 2866 * Special messages of class MD_MSG_CLASS0 can never be locked. 2867 * e.g. 
MD_MN_MSG_VERBOSITY 2868 * 2869 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS 2870 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES 2871 * 2872 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this 2873 * one class as being suspended. 2874 * If messages for this class are currently on their way, 2875 * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned. 2876 * 2877 * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set. 2878 * Messages must be generated in ascending order. 2879 * This means, a message cannot create submessages with the same or lower class. 2880 * Draining messages must go from 1 to NCLASSES in order to ensure we don't 2881 * generate a hanging situation here. 2882 * We mark class 1 as being suspended. 2883 * if the class is not busy, we proceed with class 2 2884 * and so on 2885 * if a class *is* busy, we cannot continue here, but return 2886 * MDMNE_SET_NOT_DRAINED. 2887 * We expect the caller to hold on for some seconds and try again. 2888 * When that message, that held the class busy is done in 2889 * mdmn_master_process_msg(), mdmn_mark_class_unbusy() called. 2890 * There it is checked if the class is about to drain. 2891 * In that case it tries to drain all higher classes there. 2892 * 2893 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets. 2894 * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are 2895 * completely drained. 
2896 * 2897 * Returns: 2898 * MDMNE_ACK on sucess (set is drained, no outstanding messages) 2899 * MDMNE_SET_NOT_DRAINED if drain process is started, but there are 2900 * still outstanding messages for this set(s) 2901 * MDMNE_EINVAL if setno is out of range 2902 * MDMNE_NOT_JOINED if the set is not yet initialized on this node 2903 */ 2904 2905 /* ARGSUSED */ 2906 int * 2907 mdmn_comm_suspend_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2908 { 2909 int *retval; 2910 int failure = 0; 2911 set_t startset, endset; 2912 set_t setno = msc->msc_set; 2913 md_mn_msgclass_t oclass = msc->msc_class; 2914 #ifdef NOT_YET_NEEDED 2915 uint_t flags = msc->msc_flags; 2916 #endif /* NOT_YET_NEEDED */ 2917 md_mn_msgclass_t class; 2918 2919 retval = Malloc(sizeof (int)); 2920 2921 /* check if the global initialization is done */ 2922 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2923 global_init(); 2924 } 2925 2926 /* is this rpc request coming from the local node ? */ 2927 if (check_license(rqstp, 0) == FALSE) { 2928 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2929 *retval = MDMNE_RPC_FAIL; 2930 return (retval); 2931 } 2932 2933 commd_debug(MD_MMV_MISC, "suspend: called for set=%d class=%d\n", 2934 setno, oclass); 2935 2936 /* Perform some range checking */ 2937 if (setno >= MD_MAXSETS) { 2938 *retval = MDMNE_EINVAL; 2939 commd_debug(MD_MMV_MISC, "suspend: returning MDMNE_EINVAL\n"); 2940 return (retval); 2941 } 2942 2943 /* setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. 
*/ 2944 if (setno == MD_COMM_ALL_SETS) { 2945 startset = 1; 2946 endset = MD_MAXSETS - 1; 2947 } else { 2948 startset = setno; 2949 endset = setno; 2950 } 2951 2952 for (setno = startset; setno <= endset; setno++) { 2953 /* Here we need the mutexes for the set to be setup */ 2954 if (md_mn_set_inited[setno] != MDMN_SET_MUTEXES) { 2955 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES); 2956 } 2957 2958 (void) mutex_lock(&mdmn_busy_mutex[setno]); 2959 /* shall we drain all classes of this set? */ 2960 if (oclass == MD_COMM_ALL_CLASSES) { 2961 for (class = 1; class < MD_MN_NCLASSES; class ++) { 2962 commd_debug(MD_MMV_MISC, 2963 "suspend: suspending set %d, class %d\n", 2964 setno, class); 2965 *retval = mdmn_mark_class_suspended(setno, 2966 class, MDMN_SUSPEND_ALL); 2967 if (*retval == MDMNE_SET_NOT_DRAINED) { 2968 failure++; 2969 } 2970 } 2971 } else { 2972 /* only drain one specific class */ 2973 commd_debug(MD_MMV_MISC, 2974 "suspend: suspending set=%d class=%d\n", 2975 setno, oclass); 2976 *retval = mdmn_mark_class_suspended(setno, oclass, 2977 MDMN_SUSPEND_1); 2978 if (*retval == MDMNE_SET_NOT_DRAINED) { 2979 failure++; 2980 } 2981 } 2982 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 2983 } 2984 /* If one or more sets are not entirely drained, failure is non-zero */ 2985 if (failure != 0) { 2986 *retval = MDMNE_SET_NOT_DRAINED; 2987 commd_debug(MD_MMV_MISC, 2988 "suspend: returning MDMNE_SET_NOT_DRAINED\n"); 2989 } else { 2990 *retval = MDMNE_ACK; 2991 } 2992 2993 return (retval); 2994 } 2995 2996 /* 2997 * mdmn_comm_resume_svc_2(setno, class) 2998 * 2999 * Resume processing messages for a given set. 3000 * This incorporates the repeal of a previous suspend operation. 3001 * 3002 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS 3003 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES 3004 * 3005 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this 3006 * one class as being resumed. 
3007 * 3008 * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set. 3009 * 3010 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets. 3011 * 3012 * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also 3013 * reset any ABORT flag from the global state. 3014 * 3015 * Returns: 3016 * MDMNE_ACK on sucess (resuming an unlocked set is Ok) 3017 * MDMNE_EINVAL if setno is out of range 3018 * MDMNE_NOT_JOINED if the set is not yet initialized on this node 3019 */ 3020 /* ARGSUSED */ 3021 int * 3022 mdmn_comm_resume_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 3023 { 3024 int *retval; 3025 set_t startset, endset; 3026 set_t setno = msc->msc_set; 3027 md_mn_msgclass_t oclass = msc->msc_class; 3028 uint_t flags = msc->msc_flags; 3029 md_mn_msgclass_t class; 3030 3031 retval = Malloc(sizeof (int)); 3032 3033 /* check if the global initialization is done */ 3034 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 3035 global_init(); 3036 } 3037 3038 /* is this rpc request coming from the local node ? 
*/ 3039 if (check_license(rqstp, 0) == FALSE) { 3040 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 3041 *retval = MDMNE_RPC_FAIL; 3042 return (retval); 3043 } 3044 3045 commd_debug(MD_MMV_MISC, "resume: called for set=%d class=%d\n", 3046 setno, oclass); 3047 3048 /* Perform some range checking */ 3049 if (setno > MD_MAXSETS) { 3050 *retval = MDMNE_EINVAL; 3051 return (retval); 3052 } 3053 3054 if (setno == MD_COMM_ALL_SETS) { 3055 startset = 1; 3056 endset = MD_MAXSETS - 1; 3057 if (oclass == MD_COMM_ALL_CLASSES) { 3058 /* This is the point where we "unabort" the commd */ 3059 commd_debug(MD_MMV_MISC, "resume: resetting ABORT\n"); 3060 md_commd_global_state &= ~MD_CGS_ABORTED; 3061 } 3062 } else { 3063 startset = setno; 3064 endset = setno; 3065 } 3066 3067 for (setno = startset; setno <= endset; setno++) { 3068 3069 /* Here we need the mutexes for the set to be setup */ 3070 if ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0) { 3071 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES); 3072 } 3073 3074 (void) mutex_lock(&mdmn_busy_mutex[setno]); 3075 3076 if (oclass == MD_COMM_ALL_CLASSES) { 3077 int end_class = 1; 3078 /* 3079 * When SUSPENDing all classes, we go 3080 * from 1 to MD_MN_NCLASSES-1 3081 * The correct reverse action is RESUMing 3082 * from MD_MN_NCLASSES-1 to 1 (or 2) 3083 */ 3084 3085 if (flags & MD_MSCF_DONT_RESUME_CLASS1) { 3086 end_class = 2; 3087 } 3088 3089 /* 3090 * Then mark all classes of this set as no longer 3091 * suspended. This supersedes any previous suspend(1) 3092 * calls and resumes the set entirely. 3093 */ 3094 for (class = MD_MN_NCLASSES - 1; class >= end_class; 3095 class --) { 3096 commd_debug(MD_MMV_MISC, 3097 "resume: resuming set=%d class=%d\n", 3098 setno, class); 3099 mdmn_mark_class_resumed(setno, class, 3100 (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1)); 3101 } 3102 } else { 3103 /* 3104 * In this case only one class is marked as not 3105 * suspended. 
If a suspend(all) is currently active for 3106 * this set, this class will still be suspended. 3107 * That state will be cleared by a suspend(all) 3108 * (see above) 3109 */ 3110 commd_debug(MD_MMV_MISC, 3111 "resume: resuming set=%d class=%d\n", 3112 setno, oclass); 3113 mdmn_mark_class_resumed(setno, oclass, MDMN_SUSPEND_1); 3114 } 3115 3116 (void) mutex_unlock(&mdmn_busy_mutex[setno]); 3117 } 3118 3119 *retval = MDMNE_ACK; 3120 return (retval); 3121 } 3122 /* ARGSUSED */ 3123 int * 3124 mdmn_comm_reinit_set_svc_2(set_t *setnop, struct svc_req *rqstp) 3125 { 3126 int *retval; 3127 md_mnnode_desc *node; 3128 set_t setno = *setnop; 3129 3130 retval = Malloc(sizeof (int)); 3131 3132 /* check if the global initialization is done */ 3133 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 3134 global_init(); 3135 } 3136 3137 /* is this rpc request coming from the local node ? */ 3138 if (check_license(rqstp, 0) == FALSE) { 3139 xdr_free(xdr_set_t, (caddr_t)setnop); 3140 *retval = MDMNE_RPC_FAIL; 3141 return (retval); 3142 } 3143 3144 commd_debug(MD_MMV_MISC, "reinit: set=%d\n", setno); 3145 3146 (void) rw_rdlock(&set_desc_rwlock[setno]); 3147 /* 3148 * We assume, that all messages have been suspended previously. 3149 * 3150 * As we are modifying lots of clients here we grab the client_rwlock 3151 * in writer mode. This ensures, no new messages come in. 3152 */ 3153 (void) rw_wrlock(&client_rwlock[setno]); 3154 /* This set is no longer initialized */ 3155 3156 if ((set_descriptor[setno] != NULL) && 3157 (md_mn_set_inited[setno] & MDMN_SET_NODES)) { 3158 /* destroy all rpc clients from this set */ 3159 for (node = set_descriptor[setno]->sd_nodelist; node; 3160 node = node->nd_next) { 3161 /* 3162 * Since the CLIENT for ourself will be recreated 3163 * shortly, and this node is guaranteed to be 3164 * there after a reconfig, there's no reason to go 3165 * through destroying it. 
It also avoids an issue 3166 * with calling clnt_create() later from within the 3167 * server thread, which can effectively deadlock 3168 * itself due to RPC design limitations. 3169 */ 3170 if (node == set_descriptor[setno]->sd_mn_mynode) 3171 continue; 3172 mdmn_clnt_destroy(client[setno][node->nd_nodeid]); 3173 if (client[setno][node->nd_nodeid] != (CLIENT *)NULL) { 3174 client[setno][node->nd_nodeid] = (CLIENT *)NULL; 3175 } 3176 } 3177 md_mn_set_inited[setno] &= ~MDMN_SET_NODES; 3178 } 3179 3180 commd_debug(MD_MMV_MISC, "reinit: done init_set(%d)\n", setno); 3181 3182 (void) rw_unlock(&client_rwlock[setno]); 3183 (void) rw_unlock(&set_desc_rwlock[setno]); 3184 *retval = MDMNE_ACK; 3185 return (retval); 3186 } 3187 3188 /* 3189 * This is just an interface for testing purpose. 3190 * Here we can disable single message types. 3191 * If we block a message type, this is valid for all MN sets. 3192 * If a message arrives later, and it's message type is blocked, it will 3193 * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to 3194 * resend this message over and over again. 3195 */ 3196 3197 /* ARGSUSED */ 3198 int * 3199 mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp) 3200 { 3201 int *retval; 3202 md_mn_msgtype_t type = mmtl->mmtl_type; 3203 uint_t lock = mmtl->mmtl_lock; 3204 3205 retval = Malloc(sizeof (int)); 3206 3207 /* check if the global initialization is done */ 3208 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 3209 global_init(); 3210 } 3211 3212 /* is this rpc request coming from the local node ? 
*/ 3213 if (check_license(rqstp, 0) == FALSE) { 3214 xdr_free(xdr_md_mn_type_and_lock_t, (caddr_t)mmtl); 3215 *retval = MDMNE_RPC_FAIL; 3216 return (retval); 3217 } 3218 3219 /* Perform some range checking */ 3220 if ((type == 0) || (type >= MD_MN_NMESSAGES)) { 3221 *retval = MDMNE_EINVAL; 3222 return (retval); 3223 } 3224 3225 commd_debug(MD_MMV_MISC, "msglock: type=%d, lock=%d\n", type, lock); 3226 msgtype_lock_state[type] = lock; 3227 3228 *retval = MDMNE_ACK; 3229 return (retval); 3230 } 3231