1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <unistd.h> 30 #include <sys/types.h> 31 #include <sys/stat.h> 32 #include <sys/statvfs.h> 33 #include <sys/uadmin.h> 34 #include <fcntl.h> 35 #include <stdio.h> 36 #include <thread.h> 37 #include <meta.h> 38 #include <sdssc.h> 39 #include <mdmn_changelog.h> 40 #include "mdmn_subr.h" 41 42 /* 43 * This is the communication daemon for SVM Multi Node Disksets. 44 * It runs on every node and provides the following rpc services: 45 * - mdmn_send_svc_1 46 * - mdmn_work_svc_1 47 * - mdmn_wakeup_initiator_svc_1 48 * - mdmn_wakeup_master_svc_1 49 * - mdmn_comm_lock_svc_1 50 * - mdmn_comm_unlock_svc_1 51 * - mdmn_comm_suspend_svc_1 52 * - mdmn_comm_resume_svc_1 53 * - mdmn_comm_reinit_set_svc_1 54 * where send, lock, unlock and reinit are meant for external use, 55 * work and the two wakeups are for internal use only. 
 *
 * NOTE:
 * On every node only one of those xxx_1 functions can be active at the
 * same time because the daemon is single threaded.
 *
 *
 * In case an event occurs that has to be propagated to all the nodes...
 *
 * One node (the initiator)
 *	calls the libmeta function mdmn_send_message()
 *	This function calls the local daemon thru mdmn_send_svc_1.
 *
 * On the initiator:
 *	mdmn_send_svc_1()
 *	    - starts a thread -> mdmn_send_to_work() and returns.
 *	mdmn_send_to_work()
 *	    - sends this message over to the master of the diskset.
 *	      This is done by calling mdmn_work_svc_1 on the master.
 *	    - registers to the initiator_table
 *	    - exits without doing a svc_sendreply() for the call to
 *	      mdmn_send_svc_1. This means that call is blocked until somebody
 *	      (see end of this comment) does a svc_sendreply().
 *	      This means mdmn_send_message() does not yet return.
 *	    - A timeout surveillance is started at this point.
 *	      This means in case the master doesn't reply at all in an
 *	      appropriate time, an error condition is returned
 *	      to the caller.
 *
 * On the master:
 *	mdmn_work_svc_1()
 *	    - starts a thread -> mdmn_master_process_msg() and returns
 *	mdmn_master_process_msg()
 *	    - logs the message to the change log
 *	    - executes the message locally
 *	    - flags the message in the change log
 *	    - sends the message to mdmn_work_svc_1() on all the
 *	      other nodes (slaves)
 *	      after each call to mdmn_work_svc_1 the thread goes to sleep and
 *	      will be woken up by mdmn_wakeup_master_svc_1() as soon as the
 *	      slave node is done with this message.
 *	    - In case the slave doesn't respond in an appropriate time, an
 *	      error is assumed to ensure the master doesn't wait forever.
98 * 99 * On a slave: 100 * mdmn_work_svc_1() 101 * - starts a thread -> mdmn_slave_process_msg() and returns 102 * mdmn_slave_process_msg() 103 * - processes this message locally by calling the appropriate message 104 * handler, that creates some result. 105 * - sends that result thru a call to mdmn_wakeup_master_svc_1() to 106 * the master. 107 * 108 * Back on the master: 109 * mdmn_wakeup_master_svc_1() 110 * - stores the result into the master_table. 111 * - signals the mdmn_master_process_msg-thread. 112 * - returns 113 * mdmn_master_process_msg() 114 * - after getting the results from all nodes 115 * - sends them back to the initiating node thru a call to 116 * mdmn_wakeup_initiator_svc_1. 117 * 118 * Back on the initiator: 119 * mdmn_wakeup_initiator_svc_1() 120 * - calls svc_sendreply() which makes the call to mdmn_send_svc_1() 121 * return. 122 * which allows the initial mdmn_send_message() call to return. 123 */ 124 125 FILE *commdout; /* debug output for the commd */ 126 char *commdoutfile; /* file name for the above output */ 127 /* want at least 10 MB free space when logging into a file */ 128 #define MIN_FS_SPACE (10LL * 1024 * 1024) 129 130 /* 131 * Number of outstanding messages that were initiated by this node. 
132 * If zero, check_timeouts goes to sleep 133 */ 134 uint_t messages_on_their_way; 135 mutex_t check_timeout_mutex; /* need mutex to protect above */ 136 cond_t check_timeout_cv; /* trigger for check_timeouts */ 137 138 /* for printing out time stamps */ 139 hrtime_t __savetime; 140 141 /* RPC clients for every set and every node and their protecting locks */ 142 CLIENT *client[MD_MAXSETS][NNODES]; 143 rwlock_t client_rwlock[MD_MAXSETS]; 144 145 /* the descriptors of all possible sets and their protectors */ 146 struct md_set_desc *set_descriptor[MD_MAXSETS]; 147 rwlock_t set_desc_rwlock[MD_MAXSETS]; 148 149 /* the daemon to daemon communication has to timeout quickly */ 150 static struct timeval FOUR_SECS = { 4, 0 }; 151 152 /* These indicate if a set has already been setup */ 153 int md_mn_set_inited[MD_MAXSETS]; 154 155 /* For every set we have a message completion table and protecting mutexes */ 156 md_mn_mct_t *mct[MD_MAXSETS]; 157 mutex_t mct_mutex[MD_MAXSETS][MD_MN_NCLASSES]; 158 159 /* Stuff to describe the global status of the commd on one node */ 160 #define MD_CGS_INITED 0x0001 161 #define MD_CGS_ABORTED 0x0002 /* return everything with MDMNE_ABORT */ 162 uint_t md_commd_global_state = 0; /* No state when starting up */ 163 164 /* 165 * Global verbosity level for the daemon 166 */ 167 uint_t md_commd_global_verb; 168 169 /* 170 * libmeta doesn't like multiple threads in metaget_setdesc(). 171 * So we must protect access to it with a global lock 172 */ 173 mutex_t get_setdesc_mutex; 174 175 /* 176 * Need a way to block single message types, 177 * hence an array with a status for every message type 178 */ 179 uint_t msgtype_lock_state[MD_MN_NMESSAGES]; 180 181 /* for reading in the config file */ 182 #define MAX_LINE_SIZE 1024 183 184 extern char *commd_get_outfile(void); 185 extern uint_t commd_get_verbosity(void); 186 187 /* 188 * mdmn_clnt_create is a helper function for meta_client_create_retry. 
It 189 * merely needs to call clnt_create_timed, and meta_client_create_retry 190 * will take care of the rest. 191 */ 192 /* ARGSUSED */ 193 static CLIENT * 194 mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out) 195 { 196 md_mnnode_desc *node = (md_mnnode_desc *)data; 197 198 return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, ONE, "tcp", 199 time_out)); 200 } 201 202 #define FLUSH_DEBUGFILE() \ 203 if (commdout != (FILE *)NULL) { \ 204 fflush(commdout); \ 205 fsync(fileno(commdout)); \ 206 } 207 208 static void 209 panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval, 210 md_mn_result_t *slave_result) 211 { 212 md_mn_commd_err_t commd_err; 213 md_error_t mne = mdnullerror; 214 char *msg_buf; 215 216 msg_buf = (char *)calloc(MAXPATHLEN + 1, sizeof (char)); 217 218 FLUSH_DEBUGFILE(); 219 220 if (master_err != MDMNE_ACK) { 221 snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on master " 222 "when processing message type %d\n", type); 223 } else if (slave_result == NULL) { 224 snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on node " 225 "%d when processing message type %d\n", nid, type); 226 } else { 227 snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: Inconsistent " 228 "return value from node %d when processing message " 229 "type %d. Master exitval = %d, Slave exitval = %d\n", 230 nid, type, master_exitval, slave_result->mmr_exitval); 231 } 232 commd_err.size = strlen(msg_buf); 233 commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0]; 234 235 metaioctl(MD_MN_COMMD_ERR, &commd_err, &mne, "rpc.mdcommd"); 236 (void) uadmin(A_DUMP, AD_BOOT, NULL); 237 } 238 239 static void 240 flush_fcout() 241 { 242 struct statvfs64 vfsbuf; 243 long long avail_bytes; 244 int warned = 0; 245 246 for (; ; ) { 247 sleep(10); 248 /* No output file, nothing to do */ 249 if (commdout == (FILE *)NULL) 250 continue; 251 252 /* 253 * stat the appropriate filesystem to check for available space. 
254 */ 255 if (statvfs64(commdoutfile, &vfsbuf)) { 256 continue; 257 } 258 259 avail_bytes = vfsbuf.f_frsize * vfsbuf.f_bavail; 260 /* 261 * If we don't have enough space, we print out a warning. 262 * And we drop the verbosity level to NULL 263 * In case the condtion doesn't go away, we don't repeat 264 * the warning. 265 */ 266 if (avail_bytes < MIN_FS_SPACE) { 267 if (warned) { 268 continue; 269 } 270 commd_debug(MD_MMV_SYSLOG, 271 "NOT enough space available for logging\n"); 272 commd_debug(MD_MMV_SYSLOG, 273 "Have %lld bytes, need %lld bytes\n", 274 avail_bytes, MIN_FS_SPACE); 275 warned = 1; 276 md_commd_global_verb = MD_MMV_NULL; 277 } else { 278 warned = 0; 279 } 280 281 fflush(commdout); 282 } 283 } 284 285 /* safer version of clnt_destroy. If clnt is NULL don't do anything */ 286 #define mdmn_clnt_destroy(clnt) { \ 287 if (clnt) \ 288 clnt_destroy(clnt); \ 289 } 290 291 /* 292 * Own version of svc_sendreply that checks the integrity of the transport 293 * handle and so prevents us from core dumps in the real svc_sendreply() 294 */ 295 void 296 mdmn_svc_sendreply(SVCXPRT *transp, xdrproc_t xdr, caddr_t data) 297 { 298 if (SVC_STAT(transp) == XPRT_DIED) { 299 commd_debug(MD_MMV_MISC, 300 "mdmn_svc_sendreply: XPRT_DIED\n"); 301 return; 302 } 303 (void) svc_sendreply(transp, xdr, data); 304 } 305 306 /* 307 * timeout_initiator(set, class) 308 * 309 * Alas, I sent a message and didn't get a response back in aproppriate time. 310 * 311 * timeout_initiator() takes care for doing the needed svc_sendreply() to the 312 * calling mdmn_send_message, so that guy doesn't wait forever 313 * What is done here is pretty much the same as what is done in 314 * wakeup initiator. The difference is that we cannot provide for any results, 315 * of course and we set the comm_state to MDMNE_TIMEOUT. 316 * 317 * By doing so, mdmn_send_message can decide if a retry would make sense or not. 318 * It's not our's to decide that here. 
319 */ 320 void 321 timeout_initiator(set_t setno, md_mn_msgclass_t class) 322 { 323 SVCXPRT *transp; 324 md_mn_msgid_t mid; 325 md_mn_result_t *resultp; 326 327 resultp = Zalloc(sizeof (md_mn_result_t)); 328 resultp->mmr_comm_state = MDMNE_TIMEOUT; 329 330 commd_debug(MD_MMV_MISC, 331 "timeout_initiator set = %d, class = %d\n", setno, class); 332 333 transp = mdmn_get_initiator_table_transp(setno, class); 334 mdmn_get_initiator_table_id(setno, class, &mid); 335 336 commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n", 337 MSGID_ELEMS(mid)); 338 339 /* return to mdmn_send_message() and let it deal with the situation */ 340 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 341 342 free(resultp); 343 commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n"); 344 mdmn_unregister_initiator_table(setno, class); 345 } 346 347 348 /* 349 * check_timeouts - thread 350 * 351 * This implements a timeout surveillance for messages sent from the 352 * initiator to the master. 353 * 354 * If a message is started, this thread is triggered thru 355 * cond_signal(&check_timeout_cv) and we keep track of the numbers of 356 * messages that are outstanding (messages_on_their_way). 357 * 358 * As long as there are messages on their way, this thread never goes to sleep. 359 * It'll keep checking all class/set combinations for outstanding messages. 360 * If one is found, it's checked if this message is overdue. In that case, 361 * timeout_initiator() is called to wakeup the calling mdmn_send_message and 362 * to clean up the mess. 363 * 364 * If the result from the master arrives later, this message is considered 365 * to be unsolicited. And will be ignored. 
366 */ 367 368 void 369 check_timeouts() 370 { 371 set_t setno; 372 time_t now, then; 373 mutex_t *mx; 374 md_mn_msgclass_t class; 375 376 for (; ; ) { 377 now = time((time_t *)NULL); 378 for (setno = 1; setno < MD_MAXSETS; setno++) { 379 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 380 continue; 381 } 382 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; 383 class++) { 384 mx = mdmn_get_initiator_table_mx(setno, class); 385 mutex_lock(mx); 386 387 /* then is the registered time */ 388 then = 389 mdmn_get_initiator_table_time(setno, class); 390 if ((then != 0) && (now > then)) { 391 timeout_initiator(setno, class); 392 } 393 mutex_unlock(mx); 394 } 395 } 396 /* it's ok to check only once per second */ 397 sleep(1); 398 399 /* is there work to do? */ 400 mutex_lock(&check_timeout_mutex); 401 if (messages_on_their_way == 0) { 402 cond_wait(&check_timeout_cv, &check_timeout_mutex); 403 } 404 mutex_unlock(&check_timeout_mutex); 405 } 406 } 407 408 void 409 setup_debug(void) 410 { 411 char *tmp_dir; 412 413 /* Read in the debug-controlling tokens from runtime.cf */ 414 md_commd_global_verb = commd_get_verbosity(); 415 /* 416 * If the user didn't specify a verbosity level in runtime.cf 417 * we can safely return here. As we don't intend to printout 418 * debug messages, we don't need to check for the output file. 
419 */ 420 if (md_commd_global_verb == 0) { 421 return; 422 } 423 424 /* if commdout is non-NULL it is an open FILE, we'd better close it */ 425 if (commdout != (FILE *)NULL) { 426 fclose(commdout); 427 } 428 429 commdoutfile = commd_get_outfile(); 430 431 /* setup the debug output */ 432 if (commdoutfile == (char *)NULL) { 433 /* if no valid file was specified, use the default */ 434 commdoutfile = "/var/run/commd.out"; 435 commdout = fopen(commdoutfile, "a"); 436 } else { 437 /* check if the directory exists and is writable */ 438 tmp_dir = strdup(commdoutfile); 439 if ((access(dirname(tmp_dir), X_OK|W_OK)) || 440 ((commdout = fopen(commdoutfile, "a")) == (FILE *)NULL)) { 441 syslog(LOG_ERR, 442 "Can't write to specified output file %s,\n" 443 "using /var/run/commd.out instead\n", commdoutfile); 444 free(commdoutfile); 445 commdoutfile = "/var/run/commd.out"; 446 commdout = fopen(commdoutfile, "a"); 447 } 448 free(tmp_dir); 449 } 450 451 if (commdout == (FILE *)NULL) { 452 syslog(LOG_ERR, "Can't write to debug output file %s\n", 453 commdoutfile); 454 } 455 } 456 457 /* 458 * mdmn_is_node_dead checks to see if a node is dead using 459 * the SunCluster infrastructure which is a stable interface. 460 * If unable to contact SunCuster the node is assumed to be alive. 
461 * Return values: 462 * 1 - node is dead 463 * 0 - node is alive 464 */ 465 int 466 mdmn_is_node_dead(md_mnnode_desc *node) 467 { 468 char *fmt = "/usr/cluster/bin/scha_cluster_get -O NODESTATE_NODE "; 469 char *cmd; 470 size_t size; 471 char buf[10]; 472 FILE *ptr; 473 int retval = 0; 474 475 /* I know that I'm alive */ 476 if (strcmp(node->nd_nodename, mynode()) == 0) 477 return (retval); 478 479 size = strlen(fmt) + strlen(node->nd_nodename) + 1; 480 cmd = Zalloc(size); 481 (void) strlcat(cmd, fmt, size); 482 (void) strlcat(cmd, node->nd_nodename, size); 483 484 if ((ptr = popen(cmd, "r")) != NULL) { 485 if (fgets(buf, sizeof (buf), ptr) != NULL) { 486 /* If scha_cluster_get returned DOWN - return dead */ 487 if (strncmp(buf, "DOWN", 4) == 0) 488 retval = 1; 489 } 490 (void) pclose(ptr); 491 } 492 Free(cmd); 493 return (retval); 494 } 495 496 /* 497 * global_init() 498 * 499 * Perform some global initializations. 500 * 501 * the following routines have to call this before operation can start: 502 * - mdmn_send_svc_1 503 * - mdmn_work_svc_1 504 * - mdmn_comm_lock_svc_1 505 * - mdmn_comm_unlock_svc_1 506 * - mdmn_comm_suspend_svc_1 507 * - mdmn_comm_resume_svc_1 508 * - mdmn_comm_reinit_set_svc_1 509 * 510 * This is a single threaded daemon, so it can only be in one of the above 511 * routines at the same time. 512 * This means, global_init() cannot be called more than once at the same time. 513 * Hence, no lock is needed. 
514 */ 515 void 516 global_init(void) 517 { 518 set_t set; 519 md_mn_msgclass_t class; 520 struct sigaction sighandler; 521 time_t clock_val; 522 523 /* Do these global initializations only once */ 524 if (md_commd_global_state & MD_CGS_INITED) { 525 return; 526 } 527 (void) sdssc_bind_library(); 528 529 /* setup the debug options from the config file */ 530 setup_debug(); 531 532 /* Make setup_debug() be the action in case of SIGHUP */ 533 sighandler.sa_flags = 0; 534 sigfillset(&sighandler.sa_mask); 535 sighandler.sa_handler = (void (*)(int)) setup_debug; 536 sigaction(SIGHUP, &sighandler, NULL); 537 538 __savetime = gethrtime(); 539 (void) time(&clock_val); 540 commd_debug(MD_MMV_MISC, "global init called %s\n", 541 ctime(&clock_val)); 542 543 /* start a thread that flushes out the debug on a regular basis */ 544 thr_create(NULL, 0, (void *(*)(void *))flush_fcout, 545 (void *) NULL, THR_DETACHED, NULL); 546 547 /* global rwlock's / mutex's / cond_t's go here */ 548 mutex_init(&check_timeout_mutex, USYNC_THREAD, NULL); 549 cond_init(&check_timeout_cv, USYNC_THREAD, NULL); 550 mutex_init(&get_setdesc_mutex, USYNC_THREAD, NULL); 551 552 /* Make sure the initiator table is initialized correctly */ 553 for (set = 0; set < MD_MAXSETS; set++) { 554 for (class = 0; class < MD_MN_NCLASSES; class++) { 555 mdmn_unregister_initiator_table(set, class); 556 } 557 } 558 559 560 /* setup the check for timeouts */ 561 thr_create(NULL, 0, (void *(*)(void *))check_timeouts, 562 (void *) NULL, THR_DETACHED, NULL); 563 564 md_commd_global_state |= MD_CGS_INITED; 565 } 566 567 568 /* 569 * mdmn_init_client(setno, nodeid) 570 * called if client[setno][nodeid] is NULL 571 * 572 * NOTE: Must be called with set_desc_rwlock held as a reader 573 * NOTE: Must be called with client_rwlock held as a writer 574 * 575 * If the rpc client for this node has not been setup for any set, we do it now. 
576 * 577 * Returns 0 on success (node found in set, rpc client setup) 578 * -1 if metaget_setdesc failed, 579 * -2 if node not part of set 580 * -3 if clnt_create fails 581 */ 582 static int 583 mdmn_init_client(set_t setno, md_mn_nodeid_t nid) 584 { 585 md_error_t ep = mdnullerror; 586 md_mnnode_desc *node; 587 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */ 588 589 sd = set_descriptor[setno]; 590 591 /* 592 * Is the appropriate set_descriptor already initialized ? 593 * Can't think of a scenario where this is not the case, but we'd better 594 * check for it anyway. 595 */ 596 if (sd == NULL) { 597 mdsetname_t *sp; 598 599 rw_unlock(&set_desc_rwlock[setno]); /* readlock -> writelock */ 600 rw_wrlock(&set_desc_rwlock[setno]); 601 sp = metasetnosetname(setno, &ep); 602 /* Only one thread is supposed to be in metaget_setdesc() */ 603 mutex_lock(&get_setdesc_mutex); 604 sd = metaget_setdesc(sp, &ep); 605 mutex_unlock(&get_setdesc_mutex); 606 if (sd == NULL) { 607 rw_unlock(&set_desc_rwlock[setno]); /* back to ... */ 608 rw_rdlock(&set_desc_rwlock[setno]); /* ... readlock */ 609 return (-1); 610 } 611 set_descriptor[setno] = sd; 612 rw_unlock(&set_desc_rwlock[setno]); /* back to readlock */ 613 rw_rdlock(&set_desc_rwlock[setno]); 614 } 615 616 /* first we have to find the node name for this node id */ 617 for (node = sd->sd_nodelist; node; node = node->nd_next) { 618 if (node->nd_nodeid == nid) 619 break; /* we found our node in this set */ 620 } 621 622 623 if (node == (md_mnnode_desc *)NULL) { 624 commd_debug(MD_MMV_SYSLOG, 625 "FATAL: node %d not found in set %d\n", nid, setno); 626 rw_unlock(&set_desc_rwlock[setno]); 627 return (-2); 628 } 629 630 commd_debug(MD_MMV_INIT, "init: %s has the flags: 0x%x\n", 631 node->nd_nodename ? node->nd_nodename : "NULL", node->nd_flags); 632 633 /* Did this node join the diskset? 
*/ 634 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 635 commd_debug(MD_MMV_INIT, "init: %s didn't join set %d\n", 636 node->nd_nodename ? node->nd_nodename : "NULL", setno); 637 rw_unlock(&set_desc_rwlock[setno]); 638 return (-2); 639 } 640 641 /* if clnt_create has not been done for that node, do it now */ 642 if (client[setno][nid] == (CLIENT *) NULL) { 643 time_t tout = 0; 644 645 /* 646 * While trying to create a connection to a node, 647 * periodically check to see if the node has been marked 648 * dead by the SunCluster infrastructure. 649 * This periodic check is needed since a non-responsive 650 * rpc.mdcommd (while it is attempting to create a connection 651 * to a dead node) can lead to large delays and/or failures 652 * in the reconfig steps. 653 */ 654 while ((client[setno][nid] == (CLIENT *) NULL) && 655 (tout < MD_CLNT_CREATE_TOUT)) { 656 client[setno][nid] = meta_client_create_retry 657 (node->nd_nodename, mdmn_clnt_create, 658 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); 659 /* Is the node dead? 
*/ 660 if (mdmn_is_node_dead(node) == 1) { 661 commd_debug(MD_MMV_SYSLOG, 662 "rpc.mdcommd: no client for dead node %s\n", 663 node->nd_nodename); 664 break; 665 } else 666 tout += MD_CLNT_CREATE_SUBTIMEOUT; 667 } 668 669 if (client[setno][nid] == (CLIENT *) NULL) { 670 clnt_pcreateerror(node->nd_nodename); 671 rw_unlock(&set_desc_rwlock[setno]); 672 return (-3); 673 } 674 /* this node has the license to send */ 675 commd_debug(MD_MMV_MISC, "init_client: calling add_lic\n"); 676 add_license(node); 677 678 /* set the timeout value */ 679 clnt_control(client[setno][nid], CLSET_TIMEOUT, 680 (char *)&FOUR_SECS); 681 682 } 683 rw_unlock(&set_desc_rwlock[setno]); 684 return (0); 685 } 686 687 /* 688 * check_client(setno, nodeid) 689 * 690 * must be called with reader lock held for set_desc_rwlock[setno] 691 * and must be called with reader lock held for client_rwlock[setno] 692 * Checks if the client for this set/node combination is already setup 693 * if not it upgrades the lock to a writer lock 694 * and tries to initialize the client. 695 * Finally it's checked if the client nulled out again due to some race 696 * 697 * returns 0 if there is a usable client 698 * returns MDMNE_RPC_FAIL otherwise 699 */ 700 static int 701 check_client(set_t setno, md_mn_nodeid_t nodeid) 702 { 703 int ret = 0; 704 705 while ((client[setno][nodeid] == (CLIENT *)NULL) && (ret == 0)) { 706 rw_unlock(&client_rwlock[setno]); /* upgrade reader ... */ 707 rw_wrlock(&client_rwlock[setno]); /* ... to writer lock. */ 708 if (mdmn_init_client(setno, nodeid) != 0) { 709 ret = MDMNE_RPC_FAIL; 710 } 711 rw_unlock(&client_rwlock[setno]); /* downgrade writer ... */ 712 rw_rdlock(&client_rwlock[setno]); /* ... back to reader lock. */ 713 } 714 return (ret); 715 } 716 717 /* 718 * mdmn_init_set(setno, todo) 719 * setno is the number of the set to be initialized. 720 * todo is one of the MDMN_SET_* thingies or MDMN_SET_READY 721 * If called with MDMN_SET_READY everything is initialized. 
722 * 723 * If the set mutexes are already initialized, the caller has to hold 724 * both set_desc_rwlock[setno] and client_rwlock[setno] as a writer, before 725 * calling mdmn_init_set() 726 */ 727 int 728 mdmn_init_set(set_t setno, int todo) 729 { 730 int class; 731 md_mnnode_desc *node; 732 md_set_desc *sd; /* just an abbr for set_descriptor[setno] */ 733 mdsetname_t *sp; 734 md_error_t ep = mdnullerror; 735 md_mn_nodeid_t nid; 736 737 /* 738 * Check if we are told to setup the mutexes and 739 * if these are not yet setup 740 */ 741 if ((todo & MDMN_SET_MUTEXES) && 742 ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0)) { 743 mutex_init(&mdmn_busy_mutex[setno], USYNC_THREAD, NULL); 744 cond_init(&mdmn_busy_cv[setno], USYNC_THREAD, NULL); 745 rwlock_init(&client_rwlock[setno], USYNC_THREAD, NULL); 746 rwlock_init(&set_desc_rwlock[setno], USYNC_THREAD, NULL); 747 748 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 749 mutex_init(mdmn_get_master_table_mx(setno, class), 750 USYNC_THREAD, NULL); 751 cond_init(mdmn_get_master_table_cv(setno, class), 752 USYNC_THREAD, NULL); 753 mutex_init(mdmn_get_initiator_table_mx(setno, class), 754 USYNC_THREAD, NULL); 755 } 756 md_mn_set_inited[setno] |= MDMN_SET_MUTEXES; 757 } 758 if ((todo & MDMN_SET_MCT) && 759 ((md_mn_set_inited[setno] & MDMN_SET_MCT) == 0)) { 760 int fd; 761 size_t filesize; 762 caddr_t addr; 763 char table_name[32]; 764 765 filesize = (sizeof (md_mn_mct_t)); 766 (void) snprintf(table_name, sizeof (table_name), "%s%d", 767 MD_MN_MSG_COMP_TABLE, setno); 768 /* 769 * If the mct file exists we map it into memory. 770 * Otherwise we create an empty file of appropriate 771 * size and map that into memory. 772 * The mapped areas are stored in mct[setno]. 
773 */ 774 fd = open(table_name, O_RDWR|O_CREAT|O_DSYNC, 0600); 775 if (fd < 0) { 776 commd_debug(MD_MMV_MISC, 777 "init_set: Can't open MCT\n"); 778 return (-1); 779 } 780 /* 781 * To ensure that the file has the appropriate size, 782 * we write a byte at the end of the file. 783 */ 784 lseek(fd, filesize + 1, SEEK_SET); 785 write(fd, "\0", 1); 786 787 /* at this point we have a file in place that we can mmap */ 788 addr = mmap(0, filesize, PROT_READ | PROT_WRITE, 789 MAP_SHARED, fd, (off_t)0); 790 if (addr == MAP_FAILED) { 791 commd_debug(MD_MMV_INIT, 792 "init_set: mmap mct error %d\n", 793 errno); 794 return (-1); 795 } 796 /* LINTED pointer alignment */ 797 mct[setno] = (md_mn_mct_t *)addr; 798 799 /* finally we initialize the mutexes that protect the mct */ 800 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 801 mutex_init(&(mct_mutex[setno][class]), 802 USYNC_THREAD, NULL); 803 } 804 805 md_mn_set_inited[setno] |= MDMN_SET_MCT; 806 } 807 /* 808 * Check if we are told to setup the nodes and 809 * if these are not yet setup 810 * (Attention: negative logic here compared to above!) 
811 */ 812 if (((todo & MDMN_SET_NODES) == 0) || 813 (md_mn_set_inited[setno] & MDMN_SET_NODES)) { 814 return (0); /* success */ 815 } 816 817 if ((sp = metasetnosetname(setno, &ep)) == NULL) { 818 commd_debug(MD_MMV_SYSLOG, 819 "metasetnosetname(%d) returned NULL\n", setno); 820 return (MDMNE_NOT_JOINED); 821 } 822 823 /* flush local copy of rpc.metad data */ 824 metaflushsetname(sp); 825 826 mutex_lock(&get_setdesc_mutex); 827 sd = metaget_setdesc(sp, &ep); 828 mutex_unlock(&get_setdesc_mutex); 829 830 if (sd == NULL) { 831 commd_debug(MD_MMV_SYSLOG, 832 "metaget_setdesc(%d) returned NULL\n", setno); 833 return (MDMNE_NOT_JOINED); 834 } 835 836 /* 837 * if this set is not a multinode set or 838 * this node didn't join yet the diskset, better don't do anything 839 */ 840 if ((MD_MNSET_DESC(sd) == 0) || 841 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN) == 0) { 842 commd_debug(MD_MMV_INIT, "didn't yet join set %d\n", setno); 843 return (MDMNE_NOT_JOINED); 844 } 845 846 for (node = sd->sd_nodelist; node != NULL; node = node->nd_next) { 847 time_t tout = 0; 848 nid = node->nd_nodeid; 849 850 commd_debug(MD_MMV_INIT, 851 "setting up: node=%s, priv_ic=%s, flags=0x%x\n", 852 node->nd_nodename ? node->nd_nodename : "NULL", 853 node->nd_priv_ic ? node->nd_priv_ic : "NULL", 854 node->nd_flags); 855 856 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 857 commd_debug(MD_MMV_INIT, 858 "init: %s didn't join set %d\n", 859 node->nd_nodename ? node->nd_nodename : "NULL", 860 setno); 861 continue; 862 } 863 864 if (client[setno][nid] != (CLIENT *) NULL) { 865 /* already inited */ 866 commd_debug(MD_MMV_INIT, "init: already: node=%s\n", 867 node->nd_nodename ? node->nd_nodename : "NULL"); 868 continue; 869 } 870 871 /* 872 * While trying to create a connection to a node, 873 * periodically check to see if the node has been marked 874 * dead by the SunCluster infrastructure. 
875 * This periodic check is needed since a non-responsive 876 * rpc.mdcommd (while it is attempting to create a connection 877 * to a dead node) can lead to large delays and/or failures 878 * in the reconfig steps. 879 */ 880 while ((client[setno][nid] == (CLIENT *) NULL) && 881 (tout < MD_CLNT_CREATE_TOUT)) { 882 client[setno][nid] = meta_client_create_retry 883 (node->nd_nodename, mdmn_clnt_create, 884 (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep); 885 /* Is the node dead? */ 886 if (mdmn_is_node_dead(node) == 1) { 887 commd_debug(MD_MMV_SYSLOG, 888 "rpc.mdcommd: no client for dead node %s\n", 889 node->nd_nodename); 890 break; 891 } else 892 tout += MD_CLNT_CREATE_SUBTIMEOUT; 893 } 894 895 if (client[setno][nid] == (CLIENT *) NULL) { 896 clnt_pcreateerror(node->nd_nodename); 897 /* 898 * If we cannot connect to a single node 899 * (maybe because it is down) we mark this node as not 900 * owned and continue with the next node in the list. 901 * This is better than failing the entire starting up 902 * of the commd system. 903 */ 904 node->nd_flags &= ~MD_MN_NODE_OWN; 905 commd_debug(MD_MMV_SYSLOG, 906 "WARNING couldn't create client for %s\n" 907 "Reconfig cycle required\n", 908 node->nd_nodename); 909 commd_debug(MD_MMV_INIT, 910 "WARNING couldn't create client for %s\n" 911 "Reconfig cycle required\n", 912 node->nd_nodename); 913 continue; 914 } 915 /* this node has the license to send */ 916 commd_debug(MD_MMV_MISC, "init_set: calling add_lic\n"); 917 add_license(node); 918 919 /* set the timeout value */ 920 clnt_control(client[setno][nid], CLSET_TIMEOUT, 921 (char *)&FOUR_SECS); 922 923 commd_debug(MD_MMV_INIT, "init: done: node=%s\n", 924 node->nd_nodename ? 
node->nd_nodename : "NULL"); 925 } 926 927 set_descriptor[setno] = sd; 928 md_mn_set_inited[setno] |= MDMN_SET_NODES; 929 return (0); /* success */ 930 } 931 932 void * 933 mdmn_send_to_work(void *arg) 934 { 935 int *rpc_err; 936 int success; 937 int try_master; 938 set_t setno; 939 mutex_t *mx; /* protection for initiator_table */ 940 SVCXPRT *transp; 941 md_mn_msg_t *msg; 942 md_mn_nodeid_t set_master; 943 md_mn_msgclass_t class; 944 md_mn_msg_and_transp_t *matp = (md_mn_msg_and_transp_t *)arg; 945 946 msg = matp->mat_msg; 947 transp = matp->mat_transp; 948 949 /* the alloc was done in mdmn_send_svc_1 */ 950 free(matp); 951 952 class = mdmn_get_message_class(msg->msg_type); 953 setno = msg->msg_setno; 954 955 /* set the sender, so the master knows who to send the results */ 956 rw_rdlock(&set_desc_rwlock[setno]); 957 msg->msg_sender = set_descriptor[setno]->sd_mn_mynode->nd_nodeid; 958 set_master = set_descriptor[setno]->sd_mn_master_nodeid; 959 960 mx = mdmn_get_initiator_table_mx(setno, class); 961 mutex_lock(mx); 962 963 /* 964 * Here we check, if the initiator table slot for this set/class 965 * combination is free to use. 
966 * If this is not the case, we return CLASS_BUSY forcing the 967 * initiating send_message call to retry 968 */ 969 success = mdmn_check_initiator_table(setno, class); 970 if (success == MDMNE_CLASS_BUSY) { 971 md_mn_msgid_t active_mid; 972 973 mdmn_get_initiator_table_id(setno, class, 974 &active_mid); 975 976 commd_debug(MD_MMV_SEND, 977 "send_to_work: received but locally busy " 978 "(%d, 0x%llx-%d), set=%d, class=%d, type=%d, " 979 "active msg=(%d, 0x%llx-%d)\n", 980 MSGID_ELEMS(msg->msg_msgid), setno, class, 981 msg->msg_type, MSGID_ELEMS(active_mid)); 982 } else { 983 commd_debug(MD_MMV_SEND, 984 "send_to_work: received (%d, 0x%llx-%d), " 985 "set=%d, class=%d, type=%d\n", 986 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 987 } 988 989 try_master = 2; /* return failure after two retries */ 990 while ((success == MDMNE_ACK) && (try_master--)) { 991 rw_rdlock(&client_rwlock[setno]); 992 /* is the rpc client to the master still around ? */ 993 if (check_client(setno, set_master)) { 994 success = MDMNE_RPC_FAIL; 995 FLUSH_DEBUGFILE(); 996 rw_unlock(&client_rwlock[setno]); 997 break; /* out of try_master-loop */ 998 } 999 1000 /* 1001 * Send the request to the work function on the master 1002 * this call will return immediately 1003 */ 1004 rpc_err = mdmn_work_1(msg, client[setno][set_master]); 1005 1006 /* Everything's Ok? */ 1007 if (rpc_err == NULL) { 1008 success = MDMNE_RPC_FAIL; 1009 /* 1010 * Probably something happened to the daemon on the 1011 * master. Kill the client, and try again... 
1012 */ 1013 rw_unlock(&client_rwlock[setno]); 1014 rw_wrlock(&client_rwlock[setno]); 1015 mdmn_clnt_destroy(client[setno][set_master]); 1016 if (client[setno][set_master] != (CLIENT *)NULL) { 1017 client[setno][set_master] = (CLIENT *)NULL; 1018 } 1019 rw_unlock(&client_rwlock[setno]); 1020 continue; 1021 1022 } else if (*rpc_err != MDMNE_ACK) { 1023 /* something went wrong, break out */ 1024 success = *rpc_err; 1025 free(rpc_err); 1026 rw_unlock(&client_rwlock[setno]); 1027 break; /* out of try_master-loop */ 1028 } 1029 1030 rw_unlock(&client_rwlock[setno]); 1031 free(rpc_err); 1032 1033 /* 1034 * If we are here, we sucessfully delivered the message. 1035 * We register the initiator_table, so that 1036 * wakeup_initiator_1 can do the sendreply with the 1037 * results for us. 1038 */ 1039 success = MDMNE_ACK; 1040 mdmn_register_initiator_table(setno, class, msg, transp); 1041 1042 /* tell check_timeouts, there's work to do */ 1043 mutex_lock(&check_timeout_mutex); 1044 messages_on_their_way++; 1045 cond_signal(&check_timeout_cv); 1046 mutex_unlock(&check_timeout_mutex); 1047 break; /* out of try_master-loop */ 1048 } 1049 1050 rw_unlock(&set_desc_rwlock[setno]); 1051 1052 if (success == MDMNE_ACK) { 1053 commd_debug(MD_MMV_SEND, 1054 "send_to_work: registered (%d, 0x%llx-%d)\n", 1055 MSGID_ELEMS(msg->msg_msgid)); 1056 } else { 1057 /* In case of failure do the sendreply now */ 1058 md_mn_result_t *resultp; 1059 resultp = Zalloc(sizeof (md_mn_result_t)); 1060 resultp->mmr_comm_state = success; 1061 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 1062 commd_debug(MD_MMV_SEND, 1063 "send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n", 1064 MSGID_ELEMS(msg->msg_msgid), success); 1065 free_result(resultp); 1066 1067 } 1068 1069 free_msg(msg); 1070 mutex_unlock(mx); 1071 return (NULL); 1072 1073 } 1074 1075 /* 1076 * do_message_locally(msg, result) 1077 * Process a message locally on the master 1078 * Lookup the MCT if the message has already been 
processed.
 *	If not, call the handler and store the result
 *	If yes, retrieve the result from the MCT.
 *	If another thread is processing the message right now, nothing is
 *	done besides logging that fact.
 * Return:
 *	MDMNE_ACK in case of success
 *	MDMNE_NO_HANDLER if there is no handler for this message type
 *	    (result->mmr_comm_state is set to the same value)
 *	MDMNE_LOG_FAIL if the MCT could not be checked
 */
static int
do_message_locally(md_mn_msg_t *msg, md_mn_result_t *result)
{
	int			completed;
	set_t			setno;
	md_mn_msgtype_t		msgtype = msg->msg_type;
	md_mn_msgclass_t	class;

	void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res);

	handler = mdmn_get_handler(msgtype);
	if (handler == NULL) {
		result->mmr_exitval = 0;
		/* let the sender decide if this is an error or not */
		result->mmr_comm_state = MDMNE_NO_HANDLER;
		return (MDMNE_NO_HANDLER);
	}

	class = mdmn_get_message_class(msg->msg_type);
	setno = msg->msg_setno;

	result->mmr_msgtype = msgtype;
	result->mmr_flags = msg->msg_flags;
	MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid));

	mutex_lock(&mct_mutex[setno][class]);
	completed = mdmn_check_completion(msg, result);
	if (completed == MDMN_MCT_NOT_DONE) {
		/* message not yet processed locally */
		commd_debug(MD_MMV_PROC_M, "proc_mas: "
		    "calling handler for (%d,0x%llx-%d) type %d\n",
		    MSGID_ELEMS(msg->msg_msgid), msgtype);

		/*
		 * Mark the message as being currently processed,
		 * so we won't start a second handler for it
		 */
		(void) mdmn_mark_completion(msg, NULL, MDMN_MCT_IN_PROGRESS);

		/* the MCT lock is not held while the handler is running */
		mutex_unlock(&mct_mutex[setno][class]);

		/* here we actually process the message on the master */
		(*handler)(msg, MD_MSGF_ON_MASTER, result);

		commd_debug(MD_MMV_PROC_M, "proc_mas: "
		    "finished handler for (%d,0x%llx-%d) type %d\n",
		    MSGID_ELEMS(msg->msg_msgid), msgtype);

		/* Mark the message as fully processed, store the result */
		mutex_lock(&mct_mutex[setno][class]);
		(void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE);
	} else if (completed == MDMN_MCT_DONE) {
		/* the format has no conversion for the type, don't pass it */
		commd_debug(MD_MMV_PROC_M, "proc_mas: "
		    "result for (%d, 0x%llx-%d) from MCT\n",
		    MSGID_ELEMS(msg->msg_msgid));
	} else if (completed == MDMN_MCT_IN_PROGRESS) {
		commd_debug(MD_MMV_PROC_M, "proc_mas: "
		    "(%d, 0x%llx-%d) is currently being processed\n",
		    MSGID_ELEMS(msg->msg_msgid));
	} else {
		/* MCT error occurred (should never happen) */
		mutex_unlock(&mct_mutex[setno][class]);
		result->mmr_comm_state = MDMNE_LOG_FAIL;
		commd_debug(MD_MMV_SYSLOG, "WARNING "
		    "mdmn_check_completion returned %d "
		    "for (%d,0x%llx-%d)\n", completed,
		    MSGID_ELEMS(msg->msg_msgid));
		return (MDMNE_LOG_FAIL);
	}
	mutex_unlock(&mct_mutex[setno][class]);
	return (MDMNE_ACK);

}

/*
 * do_send_message(msg, node)
 *
 * Send a message to a given node and wait for an acknowledgment, that the
 * message has arrived on the remote node.
 * Make sure that the client for the set is setup correctly.
 * If no ACK arrives, destroy and recreate the RPC client and retry the
 * message one time
 * After actually sending wait no longer than the appropriate number of
 * seconds before timing out the message.
1168 * 1169 * Note must be called with set_desc_wrlock held in reader mode 1170 */ 1171 static int 1172 do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node) 1173 { 1174 int err; 1175 int rpc_retries; 1176 int timeout_retries = 0; 1177 int *ret = NULL; 1178 set_t setno; 1179 cond_t *cv; /* see mdmn_wakeup_master_svc_1 */ 1180 mutex_t *mx; /* protection for class_busy */ 1181 timestruc_t timeout; /* surveillance for remote daemon */ 1182 md_mn_nodeid_t nid; 1183 md_mn_msgtype_t msgtype; 1184 md_mn_msgclass_t class; 1185 1186 nid = node->nd_nodeid; 1187 msgtype = msg->msg_type; 1188 setno = msg->msg_setno; 1189 class = mdmn_get_message_class(msgtype); 1190 mx = mdmn_get_master_table_mx(setno, class); 1191 cv = mdmn_get_master_table_cv(setno, class); 1192 1193 retry_rpc: 1194 1195 /* We try two times to send the message */ 1196 rpc_retries = 2; 1197 1198 /* 1199 * if sending the message doesn't succeed the first time due to a 1200 * RPC problem, we retry one time 1201 */ 1202 while ((rpc_retries != 0) && (ret == NULL)) { 1203 /* in abort state, we error out immediately */ 1204 if (md_commd_global_state & MD_CGS_ABORTED) { 1205 return (MDMNE_ABORT); 1206 } 1207 1208 rw_rdlock(&client_rwlock[setno]); 1209 /* unable to create client? Ignore it */ 1210 if (check_client(setno, nid)) { 1211 /* 1212 * In case we cannot establish an RPC client, we 1213 * take this node out of our considerations. 1214 * This will be reset by a reconfig 1215 * cycle that should come pretty soon. 1216 * MNISSUE: Should a reconfig cycle 1217 * be forced on SunCluster? 
1218 */ 1219 node->nd_flags &= ~MD_MN_NODE_OWN; 1220 commd_debug(MD_MMV_SYSLOG, 1221 "WARNING couldn't create client for %s\n" 1222 "Reconfig cycle required\n", 1223 node->nd_nodename); 1224 commd_debug(MD_MMV_PROC_M, "proc_mas: (%d,0x%llx-%d) " 1225 "WARNING couldn't create client for %s\n", 1226 MSGID_ELEMS(msg->msg_msgid), node->nd_nodename); 1227 rw_unlock(&client_rwlock[setno]); 1228 return (MDMNE_IGNORE_NODE); 1229 } 1230 /* let's be paranoid and check again before sending */ 1231 if (client[setno][nid] == NULL) { 1232 /* 1233 * if this is true, strange enough, we catch our breath, 1234 * and then continue, so that the client is set up 1235 * once again. 1236 */ 1237 commd_debug(MD_MMV_PROC_M, "client is NULL\n"); 1238 rw_unlock(&client_rwlock[setno]); 1239 sleep(1); 1240 continue; 1241 } 1242 1243 /* send it over, it will return immediately */ 1244 ret = mdmn_work_1(msg, client[setno][nid]); 1245 1246 rw_unlock(&client_rwlock[setno]); 1247 1248 if (ret != NULL) { 1249 commd_debug(MD_MMV_PROC_M, 1250 "proc_mas: sending (%d,0x%llx-%d) to %d returned " 1251 " 0x%x\n", 1252 MSGID_ELEMS(msg->msg_msgid), nid, *ret); 1253 } else { 1254 commd_debug(MD_MMV_PROC_M, 1255 "proc_mas: sending (%d,0x%llx-%d) to %d returned " 1256 " NULL \n", 1257 MSGID_ELEMS(msg->msg_msgid), nid); 1258 } 1259 1260 if ((ret == NULL) || (*ret == MDMNE_CANNOT_CONNECT) || 1261 (*ret == MDMNE_THR_CREATE_FAIL)) { 1262 /* 1263 * Something happened to the daemon on the other side. 1264 * Kill the client, and try again. 1265 * check_client() will create a new client 1266 */ 1267 rw_wrlock(&client_rwlock[setno]); 1268 mdmn_clnt_destroy(client[setno][nid]); 1269 if (client[setno][nid] != (CLIENT *)NULL) { 1270 client[setno][nid] = (CLIENT *)NULL; 1271 } 1272 rw_unlock(&client_rwlock[setno]); 1273 1274 /* ... but don't try infinitely */ 1275 --rpc_retries; 1276 continue; 1277 } 1278 /* 1279 * If the class is locked on the other node, keep trying. 
1280 * This situation will go away automatically, 1281 * if we wait long enough 1282 */ 1283 if (*ret == MDMNE_CLASS_LOCKED) { 1284 sleep(1); 1285 free(ret); 1286 ret = NULL; 1287 continue; 1288 } 1289 } 1290 if (ret == NULL) { 1291 return (MDMNE_RPC_FAIL); 1292 } 1293 1294 1295 /* if the slave is in abort state, we just ignore it. */ 1296 if (*ret == MDMNE_ABORT) { 1297 commd_debug(MD_MMV_PROC_M, 1298 "proc_mas: work(%d,0x%llx-%d) returned " 1299 "MDMNE_ABORT\n", 1300 MSGID_ELEMS(msg->msg_msgid)); 1301 free(ret); 1302 return (MDMNE_IGNORE_NODE); 1303 } 1304 1305 /* Did the remote processing succeed? */ 1306 if (*ret != MDMNE_ACK) { 1307 /* 1308 * Some commd failure in the middle of sending the msg 1309 * to the nodes. We don't continue here. 1310 */ 1311 commd_debug(MD_MMV_PROC_M, 1312 "proc_mas: work(%d,0x%llx-%d) returns %d\n", 1313 MSGID_ELEMS(msg->msg_msgid), *ret); 1314 free(ret); 1315 return (MDMNE_RPC_FAIL); 1316 } 1317 free(ret); 1318 ret = NULL; 1319 1320 /* 1321 * When we are here, we have sent the message to the other node and 1322 * we know that node has accepted it. 1323 * We go to sleep and have trust to be woken up by wakeup. 1324 * If we wakeup due to a timeout, or a signal, no result has been 1325 * placed in the appropriate slot. 1326 * If we timeout, it is likely that this is because the node has 1327 * gone away, so we will destroy the client and try it again in the 1328 * expectation that the rpc will fail and we will return 1329 * MDMNE_IGNORE_NODE. If that is not the case, the message must still 1330 * be being processed on the slave. In this case just timeout for 4 1331 * more seconds and then return RPC_FAIL if the message is not complete. 1332 */ 1333 timeout.tv_nsec = 0; 1334 timeout.tv_sec = (timeout_retries == 0) ? 
mdmn_get_timeout(msgtype) : 1335 FOUR_SECS.tv_sec; 1336 err = cond_reltimedwait(cv, mx, &timeout); 1337 1338 if (err == 0) { 1339 /* everything's fine, return success */ 1340 return (MDMNE_ACK); 1341 } 1342 1343 if (err == ETIME) { 1344 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1345 "timeout occured, set=%d, class=%d, " 1346 "msgid=(%d, 0x%llx-%d), timeout_retries=%d\n", 1347 setno, class, MSGID_ELEMS(msg->msg_msgid), timeout_retries); 1348 if (timeout_retries == 0) { 1349 timeout_retries++; 1350 /* 1351 * Destroy the client and try the rpc call again 1352 */ 1353 rw_wrlock(&client_rwlock[setno]); 1354 mdmn_clnt_destroy(client[setno][nid]); 1355 client[setno][nid] = (CLIENT *)NULL; 1356 rw_unlock(&client_rwlock[setno]); 1357 goto retry_rpc; 1358 } 1359 } else if (err == EINTR) { 1360 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1361 "commd signalled, set=%d, class=%d, " 1362 "msgid=(%d, 0x%llx-%d)\n", 1363 setno, class, MSGID_ELEMS(msg->msg_msgid)); 1364 } else { 1365 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1366 "cond_reltimedwait err=%d, set=%d, " 1367 "class=%d, msgid=(%d, 0x%llx-%d)\n", 1368 err, setno, class, 1369 MSGID_ELEMS(msg->msg_msgid)); 1370 } 1371 1372 /* some failure happened */ 1373 return (MDMNE_RPC_FAIL); 1374 } 1375 1376 /* 1377 * before we return we have to 1378 * free_msg(msg); because we are working on a copied message 1379 */ 1380 void 1381 mdmn_master_process_msg(md_mn_msg_t *msg) 1382 { 1383 int *ret; 1384 int err; 1385 int nmsgs; /* total number of msgs */ 1386 int curmsg; /* index of current msg */ 1387 set_t setno; 1388 uint_t inherit_flags = 0; 1389 uint_t secdiff, usecdiff; /* runtime of this message */ 1390 md_error_t mde = mdnullerror; 1391 md_mn_msg_t *msglist[MAX_SUBMESSAGES]; /* all msgs to process */ 1392 md_mn_msg_t *cmsg; /* current msg */ 1393 md_mn_msgid_t dummyid; 1394 md_mn_result_t *result; 1395 md_mn_result_t *slave_result; 1396 md_mn_nodeid_t sender; 1397 md_mn_nodeid_t set_master; 1398 md_mnnode_desc *node; 1399 md_mn_msgtype_t 
orig_type; /* type of the original message */ 1400 md_mn_msgtype_t msgtype; /* type of the current message */ 1401 md_mn_msgclass_t orig_class; /* class of the original message */ 1402 md_mn_msgclass_t class; /* class of the current message */ 1403 1404 int (*smgen)(md_mn_msg_t *msg, md_mn_msg_t **msglist); 1405 1406 orig_type = msgtype = msg->msg_type; 1407 sender = msg->msg_sender; 1408 setno = msg->msg_setno; 1409 1410 result = Zalloc(sizeof (md_mn_result_t)); 1411 result->mmr_setno = setno; 1412 result->mmr_msgtype = msgtype; 1413 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 1414 1415 orig_class = mdmn_get_message_class(msgtype); 1416 1417 commd_debug(MD_MMV_PROC_M, 1418 "proc_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 1419 MSGID_ELEMS(msg->msg_msgid), setno, orig_class, msgtype); 1420 1421 rw_rdlock(&set_desc_rwlock[setno]); 1422 set_master = set_descriptor[setno]->sd_mn_master_nodeid; 1423 result->mmr_sender = set_master; 1424 /* 1425 * Put message into the change log unless told otherwise 1426 * Note that we only log original messages. 1427 * If they are generated by some smgen, we don't log them! 1428 * Replay messages aren't logged either. 1429 * Note, that replay messages are unlogged on completion. 
1430 */ 1431 if ((msg->msg_flags & (MD_MSGF_NO_LOG | MD_MSGF_REPLAY_MSG)) == 0) { 1432 commd_debug(MD_MMV_PROC_M, 1433 "proc_mas: calling log_msg for (%d,0x%llx-%d) type %d\n", 1434 MSGID_ELEMS(msg->msg_msgid), msgtype); 1435 err = mdmn_log_msg(msg); 1436 if (err == MDMNE_NULL) { 1437 /* msg logged successfully */ 1438 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1439 "done log_msg for (%d,0x%llx-%d) type %d\n", 1440 MSGID_ELEMS(msg->msg_msgid), msgtype); 1441 goto proceed; 1442 } 1443 if (err == MDMNE_ACK) { 1444 /* Same msg in the slot, proceed */ 1445 commd_debug(MD_MMV_PROC_M, "proc_mas: " 1446 "already logged (%d,0x%llx-%d) type %d\n", 1447 MSGID_ELEMS(msg->msg_msgid), msgtype); 1448 goto proceed; 1449 } 1450 if (err == MDMNE_LOG_FAIL) { 1451 /* Oh, bad, the log is non functional. */ 1452 result->mmr_comm_state = MDMNE_LOG_FAIL; 1453 /* 1454 * Note that the mark_busy was already done by 1455 * mdmn_work_svc_1() 1456 */ 1457 mutex_lock(&mdmn_busy_mutex[setno]); 1458 mdmn_mark_class_unbusy(setno, orig_class); 1459 mutex_unlock(&mdmn_busy_mutex[setno]); 1460 1461 } 1462 if (err == MDMNE_CLASS_BUSY) { 1463 /* 1464 * The log is occupied with a different message 1465 * that needs to be played first. 
1466 * We reject the current message with MDMNE_CLASS_BUSY 1467 * to the initiator and do not unbusy the set/class, 1468 * because we will proceed with the logged message, 1469 * which has the same set/class combination 1470 */ 1471 result->mmr_comm_state = MDMNE_CLASS_BUSY; 1472 } 1473 ret = (int *)NULL; 1474 rw_rdlock(&client_rwlock[setno]); 1475 1476 if (check_client(setno, sender)) { 1477 commd_debug(MD_MMV_SYSLOG, 1478 "proc_mas: No client for initiator \n"); 1479 } else { 1480 ret = mdmn_wakeup_initiator_1(result, 1481 client[setno][sender]); 1482 } 1483 rw_unlock(&client_rwlock[setno]); 1484 1485 if (ret == (int *)NULL) { 1486 commd_debug(MD_MMV_SYSLOG, 1487 "proc_mas: couldn't wakeup_initiator \n"); 1488 } else { 1489 if (*ret != MDMNE_ACK) { 1490 commd_debug(MD_MMV_SYSLOG, "proc_mas: " 1491 "wakeup_initiator returned %d\n", *ret); 1492 } 1493 free(ret); 1494 } 1495 free_msg(msg); 1496 1497 if (err == MDMNE_LOG_FAIL) { 1498 /* we can't proceed here */ 1499 free_result(result); 1500 rw_unlock(&set_desc_rwlock[setno]); 1501 return; 1502 } else if (err == MDMNE_CLASS_BUSY) { 1503 mdmn_changelog_record_t *lr; 1504 lr = mdmn_get_changelogrec(setno, orig_class); 1505 assert(lr != NULL); 1506 1507 /* proceed with the logged message */ 1508 msg = copy_msg(&(lr->lr_msg), NULL); 1509 1510 /* 1511 * The logged message has to have the same class but 1512 * type and sender can be different 1513 */ 1514 orig_type = msgtype = msg->msg_type; 1515 sender = msg->msg_sender; 1516 1517 commd_debug(MD_MMV_PROC_M, 1518 "proc_mas: Got new message from change log: " 1519 "(%d,0x%llx-%d) type %d\n", 1520 MSGID_ELEMS(msg->msg_msgid), msgtype); 1521 1522 /* continue normal operation with this message */ 1523 } 1524 } 1525 1526 proceed: 1527 smgen = mdmn_get_submessage_generator(msgtype); 1528 if (smgen == NULL) { 1529 /* no submessages to create, just use the original message */ 1530 msglist[0] = msg; 1531 nmsgs = 1; 1532 } else { 1533 /* some bits are passed on to submessages */ 
1534 inherit_flags = msg->msg_flags & MD_MSGF_INHERIT_BITS; 1535 1536 nmsgs = smgen(msg, msglist); 1537 1538 /* some settings for the submessages */ 1539 for (curmsg = 0; curmsg < nmsgs; curmsg++) { 1540 cmsg = msglist[curmsg]; 1541 1542 /* Apply the inherited flags */ 1543 cmsg->msg_flags |= inherit_flags; 1544 1545 /* 1546 * Make sure the submessage ID is set correctly 1547 * Note: first submessage has mid_smid of 1 (not 0) 1548 */ 1549 cmsg->msg_msgid.mid_smid = curmsg + 1; 1550 1551 /* need the original class set in msgID (for MCT) */ 1552 cmsg->msg_msgid.mid_oclass = orig_class; 1553 } 1554 1555 commd_debug(MD_MMV_PROC_M, 1556 "smgen generated %d submsgs, origclass = %d\n", 1557 nmsgs, orig_class); 1558 } 1559 /* 1560 * This big loop does the following. 1561 * For all messages: 1562 * process message on the master first (a message completion 1563 * table MCT ensures a message is not processed twice) 1564 * in case of an error break out of message loop 1565 * for all nodes -- unless MD_MSGF_NO_BCAST is set -- 1566 * send message to node until that succeeds 1567 * merge result -- not yet implemented 1568 * respect MD_MSGF_STOP_ON_ERROR 1569 */ 1570 for (curmsg = 0; curmsg < nmsgs; curmsg++) { 1571 int break_msg_loop = 0; 1572 mutex_t *mx; /* protection for class_busy */ 1573 int master_err; 1574 int master_exitval = -1; 1575 1576 cmsg = msglist[curmsg]; 1577 msgtype = cmsg->msg_type; 1578 class = mdmn_get_message_class(msgtype); 1579 node = NULL; 1580 mx = mdmn_get_master_table_mx(setno, class); 1581 1582 /* If we are in the abort state, we error out immediately */ 1583 if (md_commd_global_state & MD_CGS_ABORTED) { 1584 break; /* out of the message loop */ 1585 } 1586 1587 commd_debug(MD_MMV_PROC_M, "class=%d, orig_class=%d\n", 1588 class, orig_class); 1589 /* 1590 * If the current class is different from the original class, 1591 * we have to lock it down. 1592 * The original class is already marked busy. 
1593 * At this point we cannot refuse the message because the 1594 * class is busy right now, so we wait until the class becomes 1595 * available again. As soon as something changes for this set 1596 * we will be cond_signal'ed (in mdmn_mark_class_unbusy) 1597 * 1598 * Granularity could be finer (setno/class) 1599 */ 1600 if (class != orig_class) { 1601 mutex_lock(&mdmn_busy_mutex[setno]); 1602 while (mdmn_mark_class_busy(setno, class) == FALSE) { 1603 cond_wait(&mdmn_busy_cv[setno], 1604 &mdmn_busy_mutex[setno]); 1605 } 1606 mutex_unlock(&mdmn_busy_mutex[setno]); 1607 } 1608 1609 master_err = do_message_locally(cmsg, result); 1610 1611 if ((master_err != MDMNE_ACK) || 1612 ((master_err == MDMNE_ACK) && (result->mmr_exitval != 0))) { 1613 result->mmr_failing_node = set_master; 1614 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 1615 /* 1616 * if appropriate, unbusy the class and 1617 * break out of the message loop 1618 */ 1619 if (class != orig_class) { 1620 mutex_lock(&mdmn_busy_mutex[setno]); 1621 mdmn_mark_class_unbusy(setno, class); 1622 mutex_unlock(&mdmn_busy_mutex[setno]); 1623 } 1624 break; 1625 } 1626 } 1627 1628 if (master_err == MDMNE_ACK) 1629 master_exitval = result->mmr_exitval; 1630 1631 /* No broadcast? => next message */ 1632 if (cmsg->msg_flags & MD_MSGF_NO_BCAST) { 1633 /* if appropriate, unbusy the class */ 1634 if (class != orig_class) { 1635 mutex_lock(&mdmn_busy_mutex[setno]); 1636 mdmn_mark_class_unbusy(setno, class); 1637 mutex_unlock(&mdmn_busy_mutex[setno]); 1638 } 1639 continue; 1640 } 1641 1642 1643 /* fake sender, so we get notified when the results are avail */ 1644 cmsg->msg_sender = set_master; 1645 /* 1646 * register to the master_table. It's needed by wakeup_master to 1647 * wakeup the sleeping thread. 
1648 * Access is protected by the class lock: mdmn_mark_class_busy() 1649 */ 1650 mdmn_set_master_table_id(setno, class, &(cmsg->msg_msgid)); 1651 1652 1653 1654 rw_rdlock(&set_desc_rwlock[setno]); 1655 /* Send the message to all other nodes */ 1656 for (node = set_descriptor[setno]->sd_nodelist; node; 1657 node = node->nd_next) { 1658 md_mn_nodeid_t nid = node->nd_nodeid; 1659 1660 /* We are master and have already processed the msg */ 1661 if (node == set_descriptor[setno]->sd_mn_masternode) { 1662 continue; 1663 } 1664 1665 /* If this node didn't join the disk set, ignore it */ 1666 if ((node->nd_flags & MD_MN_NODE_OWN) == 0) { 1667 continue; 1668 } 1669 1670 mutex_lock(mx); 1671 /* 1672 * Register the node that is addressed, 1673 * so we can detect unsolicited messages 1674 */ 1675 mdmn_set_master_table_addr(setno, class, nid); 1676 slave_result = (md_mn_result_t *)NULL; 1677 1678 /* 1679 * Now send it. do_send_message() will return if 1680 * a failure occurs or 1681 * the results are available 1682 */ 1683 err = do_send_message(cmsg, node); 1684 1685 /* in abort state, we error out immediately */ 1686 if (md_commd_global_state & MD_CGS_ABORTED) { 1687 break; 1688 } 1689 1690 if (err == MDMNE_ACK) { 1691 slave_result = 1692 mdmn_get_master_table_res(setno, class); 1693 commd_debug(MD_MMV_PROC_M, 1694 "proc_mas: got result for (%d,0x%llx-%d)\n", 1695 MSGID_ELEMS(cmsg->msg_msgid)); 1696 } else if (err == MDMNE_IGNORE_NODE) { 1697 mutex_unlock(mx); 1698 continue; /* send to next node */ 1699 } 1700 mutex_unlock(mx); 1701 1702 1703 /* 1704 * If the result is NULL, or err doesn't show success, 1705 * something went wrong with this RPC call. 
1706 */ 1707 if ((slave_result == NULL) || (err != MDMNE_ACK)) { 1708 /* 1709 * If PANIC_WHEN_INCONSISTENT set, 1710 * panic if the master succeeded while 1711 * this node failed 1712 */ 1713 if ((cmsg->msg_flags & 1714 MD_MSGF_PANIC_WHEN_INCONSISTENT) && 1715 (master_err == MDMNE_ACK)) 1716 panic_system(nid, cmsg->msg_type, 1717 master_err, master_exitval, 1718 slave_result); 1719 1720 result->mmr_failing_node = nid; 1721 /* are we supposed to stop in case of error? */ 1722 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 1723 result->mmr_exitval = MDMNE_RPC_FAIL; 1724 commd_debug(MD_MMV_SYSLOG, "proc_mas: " 1725 "result (%d,0x%llx-%d) is NULL\n", 1726 MSGID_ELEMS(cmsg->msg_msgid)); 1727 FLUSH_DEBUGFILE(); 1728 break_msg_loop = 1; 1729 break; /* out of node loop first */ 1730 } else { 1731 /* send msg to the next node */ 1732 continue; 1733 } 1734 1735 } 1736 1737 /* 1738 * Message processed on remote node. 1739 * If PANIC_WHEN_INCONSISTENT set, panic if the 1740 * result is different on this node from the result 1741 * on the master 1742 */ 1743 if ((cmsg->msg_flags & 1744 MD_MSGF_PANIC_WHEN_INCONSISTENT) && 1745 ((master_err != MDMNE_ACK) || 1746 (slave_result->mmr_exitval != master_exitval))) 1747 panic_system(nid, cmsg->msg_type, master_err, 1748 master_exitval, slave_result); 1749 1750 /* 1751 * At this point we know we have a message that was 1752 * processed on the remote node. 1753 * We now check if the exitval is non zero. 1754 * In that case we discard the previous result and 1755 * rather use the current. 1756 * This means: If a message fails on no node, 1757 * the result from the master will be returned. 
1758 * There's currently no such thing as merge of results 1759 * If additionally STOP_ON_ERROR is set, we bail out 1760 */ 1761 if (slave_result->mmr_exitval != 0) { 1762 /* throw away the previously allocated result */ 1763 free_result(result); 1764 1765 /* copy_result() allocates new memory */ 1766 result = copy_result(slave_result); 1767 free_result(slave_result); 1768 1769 dump_result(MD_MMV_PROC_M, "proc_mas", result); 1770 1771 result->mmr_failing_node = nid; 1772 if (cmsg->msg_flags & MD_MSGF_STOP_ON_ERROR) { 1773 break_msg_loop = 1; 1774 break; /* out of node loop */ 1775 } 1776 continue; /* try next node */ 1777 1778 } else { 1779 /* 1780 * MNIssue: may want to merge the results 1781 * from all slaves. Currently only report 1782 * the results from the master. 1783 */ 1784 free_result(slave_result); 1785 } 1786 1787 } /* End of loop over the nodes */ 1788 rw_unlock(&set_desc_rwlock[setno]); 1789 1790 1791 /* release the current class again */ 1792 if (class != orig_class) { 1793 mutex_lock(&mdmn_busy_mutex[setno]); 1794 mdmn_mark_class_unbusy(setno, class); 1795 mutex_unlock(&mdmn_busy_mutex[setno]); 1796 } 1797 1798 /* are we supposed to quit entirely ? */ 1799 if (break_msg_loop || 1800 (md_commd_global_state & MD_CGS_ABORTED)) { 1801 break; /* out of msg loop */ 1802 } 1803 1804 } /* End of loop over the messages */ 1805 /* 1806 * If we are here, there's two possibilities: 1807 * - we processed all messages on all nodes without an error. 1808 * In this case we return the result from the master. 1809 * (to be implemented: return the merged result) 1810 * - we encountered an error in which case result has been 1811 * set accordingly already. 1812 */ 1813 1814 if (md_commd_global_state & MD_CGS_ABORTED) { 1815 result->mmr_comm_state = MDMNE_ABORT; 1816 } 1817 1818 /* 1819 * This message has been processed completely. 1820 * Remove it from the changelog. 1821 * Do this for replay messages too. 
1822 * Note that the message is unlogged before waking up the 1823 * initiator. This is done for two reasons. 1824 * 1. Remove a race condition that occurs when back to back 1825 * messages are sent for the same class, the registeration is 1826 * is lost. 1827 * 2. If the initiator died but the action was completed on all the 1828 * the nodes, we want that to be marked "done" quickly. 1829 */ 1830 1831 if ((msg->msg_flags & MD_MSGF_NO_LOG) == 0) { 1832 commd_debug(MD_MMV_PROC_M, 1833 "proc_mas: calling unlog_msg for (%d,0x%llx-%d) type %d\n", 1834 MSGID_ELEMS(msg->msg_msgid), msgtype); 1835 mdmn_unlog_msg(msg); 1836 commd_debug(MD_MMV_PROC_M, 1837 "proc_mas: done unlog_msg for (%d,0x%llx-%d) type %d\n", 1838 MSGID_ELEMS(msg->msg_msgid), msgtype); 1839 } 1840 1841 /* 1842 * In case of submessages, we increased the submessage ID in the 1843 * result structure. We restore the message ID to the value that 1844 * the initiator is waiting for. 1845 */ 1846 result->mmr_msgid.mid_smid = 0; 1847 result->mmr_msgtype = orig_type; 1848 result->mmr_sender = set_master; 1849 1850 /* if we have an inited client, send result */ 1851 ret = (int *)NULL; 1852 1853 rw_rdlock(&client_rwlock[setno]); 1854 if (check_client(setno, sender)) { 1855 commd_debug(MD_MMV_SYSLOG, 1856 "proc_mas: unable to create client for initiator\n"); 1857 } else { 1858 ret = mdmn_wakeup_initiator_1(result, client[setno][sender]); 1859 } 1860 rw_unlock(&client_rwlock[setno]); 1861 1862 if (ret == (int *)NULL) { 1863 commd_debug(MD_MMV_PROC_M, 1864 "proc_mas: couldn't wakeup initiator\n"); 1865 } else { 1866 if (*ret != MDMNE_ACK) { 1867 commd_debug(MD_MMV_PROC_M, 1868 "proc_mas: wakeup_initiator returned %d\n", 1869 *ret); 1870 } 1871 free(ret); 1872 } 1873 1874 rw_unlock(&set_desc_rwlock[setno]); 1875 /* Free all submessages, if there were any */ 1876 if (nmsgs > 1) { 1877 for (curmsg = 0; curmsg < nmsgs; curmsg++) { 1878 free_msg(msglist[curmsg]); 1879 } 1880 } 1881 /* Free the result */ 1882 
free_result(result); 1883 1884 mutex_lock(&mdmn_busy_mutex[setno]); 1885 mdmn_mark_class_unbusy(setno, orig_class); 1886 mutex_unlock(&mdmn_busy_mutex[setno]); 1887 1888 1889 /* 1890 * We use this ioctl just to get the time in the same format as used in 1891 * the messageID. If it fails, all we get is a bad runtime output. 1892 */ 1893 (void) metaioctl(MD_IOCGUNIQMSGID, &dummyid, &mde, NULL); 1894 secdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) >> 32; 1895 usecdiff = (dummyid.mid_time - msg->msg_msgid.mid_time) & 0xfffff; 1896 1897 /* catching possible overflow */ 1898 if (usecdiff >= 1000000) { 1899 usecdiff -= 1000000; 1900 secdiff++; 1901 } 1902 1903 1904 commd_debug(MD_MMV_PROC_M, "proc_mas: done (%d, 0x%llx-%d) type=%02d " 1905 "%5d.%06d secs runtime\n", 1906 MSGID_ELEMS(msg->msg_msgid), orig_type, secdiff, usecdiff); 1907 1908 /* Free the original message */ 1909 free_msg(msg); 1910 } 1911 1912 void 1913 mdmn_slave_process_msg(md_mn_msg_t *msg) 1914 { 1915 int *ret = NULL; 1916 int completed; 1917 int retries; 1918 int successfully_returned; 1919 set_t setno; 1920 md_mn_result_t *result; 1921 md_mn_nodeid_t sender; 1922 md_mn_nodeid_t whoami; 1923 md_mn_msgtype_t msgtype; 1924 md_mn_msgclass_t class; 1925 1926 void (*handler)(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res); 1927 1928 setno = msg->msg_setno; 1929 sender = msg->msg_sender; /* this is always the master of the set */ 1930 msgtype = msg->msg_type; 1931 1932 rw_rdlock(&set_desc_rwlock[setno]); 1933 whoami = set_descriptor[setno]->sd_mn_mynode->nd_nodeid; 1934 rw_unlock(&set_desc_rwlock[setno]); 1935 1936 result = Zalloc(sizeof (md_mn_result_t)); 1937 result->mmr_flags = msg->msg_flags; 1938 result->mmr_setno = setno; 1939 result->mmr_msgtype = msgtype; 1940 result->mmr_sender = whoami; 1941 result->mmr_comm_state = MDMNE_ACK; /* Ok state */ 1942 MSGID_COPY(&(msg->msg_msgid), &(result->mmr_msgid)); 1943 class = mdmn_get_message_class(msgtype); 1944 1945 commd_debug(MD_MMV_PROC_S, 1946 
"proc_sla: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 1947 MSGID_ELEMS(msg->msg_msgid), setno, class, msgtype); 1948 1949 handler = mdmn_get_handler(msgtype); 1950 1951 if (handler == NULL) { 1952 result->mmr_exitval = 0; 1953 /* let the sender decide if this is an error or not */ 1954 result->mmr_comm_state = MDMNE_NO_HANDLER; 1955 commd_debug(MD_MMV_PROC_S, 1956 "proc_sla: No handler for (%d, 0x%llx-%d)\n", 1957 MSGID_ELEMS(msg->msg_msgid)); 1958 } else { 1959 1960 /* Did we already process this message ? */ 1961 mutex_lock(&mct_mutex[setno][class]); 1962 completed = mdmn_check_completion(msg, result); 1963 1964 if (completed == MDMN_MCT_NOT_DONE) { 1965 /* message not yet processed locally */ 1966 commd_debug(MD_MMV_PROC_S, 1967 "proc_sla: calling handler for (%d, 0x%llx-%d)\n", 1968 MSGID_ELEMS(msg->msg_msgid)); 1969 1970 /* 1971 * Mark the message as being currently processed, 1972 * so we won't start a second handler for it 1973 */ 1974 (void) mdmn_mark_completion(msg, NULL, 1975 MDMN_MCT_IN_PROGRESS); 1976 1977 mutex_unlock(&mct_mutex[setno][class]); 1978 (*handler)(msg, MD_MSGF_ON_SLAVE, result); 1979 1980 commd_debug(MD_MMV_PROC_S, 1981 "proc_sla: finished handler for (%d, 0x%llx-%d)\n", 1982 MSGID_ELEMS(msg->msg_msgid)); 1983 1984 mutex_lock(&mct_mutex[setno][class]); 1985 /* Mark the message as fully done, store the result */ 1986 (void) mdmn_mark_completion(msg, result, MDMN_MCT_DONE); 1987 1988 } else if (completed == MDMN_MCT_DONE) { 1989 /* message processed previously, got result from MCT */ 1990 commd_debug(MD_MMV_PROC_S, 1991 "proc_sla: result for (%d, 0x%llx-%d) from MCT\n", 1992 MSGID_ELEMS(msg->msg_msgid)); 1993 } else if (completed == MDMN_MCT_IN_PROGRESS) { 1994 /* 1995 * If the message is curruntly being processed, 1996 * we can return here, without sending a result back. 
1997 * This will be done by the initial message handling 1998 * thread 1999 */ 2000 mutex_unlock(&mct_mutex[setno][class]); 2001 commd_debug(MD_MMV_PROC_M, "proc_sla: " 2002 "(%d, 0x%llx-%d) is currently being processed\n", 2003 MSGID_ELEMS(msg->msg_msgid), msgtype); 2004 2005 free_msg(msg); 2006 free_result(result); 2007 return; 2008 } else { 2009 /* MCT error occurred (should never happen) */ 2010 result->mmr_comm_state = MDMNE_LOG_FAIL; 2011 commd_debug(MD_MMV_PROC_S, 2012 "proc_sla: MCT error for (%d, 0x%llx-%d)\n", 2013 MSGID_ELEMS(msg->msg_msgid)); 2014 } 2015 mutex_unlock(&mct_mutex[setno][class]); 2016 } 2017 2018 /* 2019 * At this point we have a result (even in an error case) 2020 * that we return to the master. 2021 */ 2022 rw_rdlock(&set_desc_rwlock[setno]); 2023 retries = 2; /* we will try two times to send the results */ 2024 successfully_returned = 0; 2025 2026 while (!successfully_returned && (retries != 0)) { 2027 ret = (int *)NULL; 2028 rw_rdlock(&client_rwlock[setno]); 2029 if (check_client(setno, sender)) { 2030 /* 2031 * If we cannot setup the rpc connection to the master, 2032 * we can't do anything besides logging this fact. 2033 */ 2034 commd_debug(MD_MMV_SYSLOG, 2035 "proc_mas: unable to create client for master\n"); 2036 rw_unlock(&client_rwlock[setno]); 2037 break; 2038 } else { 2039 ret = mdmn_wakeup_master_1(result, 2040 client[setno][sender]); 2041 /* 2042 * if mdmn_wakeup_master_1 returns NULL, it can be that 2043 * the master (or the commd on the master) had died. 2044 * In that case, we destroy the client to the master 2045 * and retry. 
2046 * If mdmn_wakeup_master_1 doesn't return MDMNE_ACK, 2047 * the commd on the master is alive but 2048 * something else is wrong, 2049 * in that case a retry doesn't make sense => break out 2050 */ 2051 if (ret == (int *)NULL) { 2052 commd_debug(MD_MMV_PROC_S, 2053 "proc_sla: wakeup_master returned NULL\n"); 2054 /* release reader lock, grab writer lock */ 2055 rw_unlock(&client_rwlock[setno]); 2056 rw_wrlock(&client_rwlock[setno]); 2057 mdmn_clnt_destroy(client[setno][sender]); 2058 if (client[setno][sender] != (CLIENT *)NULL) { 2059 client[setno][sender] = (CLIENT *)NULL; 2060 } 2061 rw_unlock(&client_rwlock[setno]); 2062 retries--; 2063 commd_debug(MD_MMV_PROC_S, 2064 "retries = %d\n", retries); 2065 continue; 2066 } 2067 if (*ret != MDMNE_ACK) { 2068 commd_debug(MD_MMV_PROC_S, "proc_sla: " 2069 "wakeup_master returned %d\n", *ret); 2070 rw_unlock(&client_rwlock[setno]); 2071 break; 2072 } else { /* Good case */ 2073 successfully_returned = 1; 2074 rw_unlock(&client_rwlock[setno]); 2075 } 2076 } 2077 } 2078 2079 rw_unlock(&set_desc_rwlock[setno]); 2080 commd_debug(MD_MMV_PROC_S, "proc_sla: done (%d, 0x%llx-%d)\n", 2081 MSGID_ELEMS(msg->msg_msgid)); 2082 2083 if (ret != (int *)NULL) 2084 free(ret); 2085 free_msg(msg); 2086 free_result(result); 2087 } 2088 2089 2090 md_mn_result_t * 2091 mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp) 2092 { 2093 int err; 2094 set_t setno; 2095 SVCXPRT *transp = rqstp->rq_xprt; 2096 md_mn_msg_t *msg; 2097 md_mn_result_t *resultp; 2098 md_mn_msgclass_t class; 2099 md_mn_msg_and_transp_t *matp; 2100 2101 msg = copy_msg(omsg, NULL); 2102 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 2103 2104 setno = msg->msg_setno; 2105 class = mdmn_get_message_class(msg->msg_type); 2106 2107 /* If we are in the abort state, we error out immediately */ 2108 if (md_commd_global_state & MD_CGS_ABORTED) { 2109 resultp = Zalloc(sizeof (md_mn_result_t)); 2110 resultp->mmr_comm_state = MDMNE_ABORT; 2111 mdmn_svc_sendreply(transp, 
xdr_md_mn_result_t, (char *)resultp); 2112 free_result(resultp); 2113 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2114 return (NULL); 2115 } 2116 2117 /* check if the global initialization is done */ 2118 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2119 global_init(); 2120 } 2121 2122 commd_debug(MD_MMV_SEND, 2123 "send: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n", 2124 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 2125 2126 /* Check for verbosity related message */ 2127 if (msg->msg_type == MD_MN_MSG_VERBOSITY) { 2128 md_mn_verbose_t *d; 2129 2130 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data)); 2131 md_commd_global_verb = d->mmv_what; 2132 /* everytime the bitmask is set, we reset the timer */ 2133 __savetime = gethrtime(); 2134 /* 2135 * If local-only-flag is set, we are done here, 2136 * otherwise we pass that message on to the master. 2137 */ 2138 if (msg->msg_flags & MD_MSGF_LOCAL_ONLY) { 2139 resultp = Zalloc(sizeof (md_mn_result_t)); 2140 resultp->mmr_comm_state = MDMNE_ACK; 2141 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, 2142 (char *)resultp); 2143 free_result(resultp); 2144 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2145 return (NULL); 2146 } 2147 } 2148 2149 /* 2150 * Are we entering the abort state? 2151 * Here we don't even need to check for MD_MSGF_LOCAL_ONLY, because 2152 * this message cannot be distributed anyway. 2153 * So, it's safe to return immediately. 2154 */ 2155 if (msg->msg_type == MD_MN_MSG_ABORT) { 2156 md_commd_global_state |= MD_CGS_ABORTED; 2157 resultp = Zalloc(sizeof (md_mn_result_t)); 2158 resultp->mmr_comm_state = MDMNE_ACK; 2159 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2160 free_result(resultp); 2161 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2162 return (NULL); 2163 } 2164 2165 2166 /* 2167 * Is this message type blocked? 
2168 * If so we return MDMNE_CLASS_LOCKED, immediately 2169 */ 2170 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) { 2171 resultp = Zalloc(sizeof (md_mn_result_t)); 2172 resultp->mmr_comm_state = MDMNE_CLASS_LOCKED; 2173 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2174 free_result(resultp); 2175 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2176 commd_debug(MD_MMV_SEND, 2177 "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, " 2178 "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class, 2179 msg->msg_type); 2180 return (NULL); 2181 } 2182 2183 2184 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2185 /* Can only use the appropriate mutexes if they are inited */ 2186 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2187 rw_wrlock(&set_desc_rwlock[setno]); 2188 rw_wrlock(&client_rwlock[setno]); 2189 err = mdmn_init_set(setno, MDMN_SET_READY); 2190 rw_unlock(&client_rwlock[setno]); 2191 rw_unlock(&set_desc_rwlock[setno]); 2192 } else { 2193 err = mdmn_init_set(setno, MDMN_SET_READY); 2194 } 2195 2196 if (err) { 2197 /* couldn't initialize connections, cannot proceed */ 2198 resultp = Zalloc(sizeof (md_mn_result_t)); 2199 resultp->mmr_comm_state = err; 2200 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, 2201 (char *)resultp); 2202 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2203 free_result(resultp); 2204 commd_debug(MD_MMV_SEND, 2205 "send: init err = %d\n", err); 2206 return (NULL); 2207 } 2208 } 2209 2210 mutex_lock(&mdmn_busy_mutex[setno]); 2211 if ((mdmn_is_class_suspended(setno, class) == TRUE) && 2212 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) { 2213 mutex_unlock(&mdmn_busy_mutex[setno]); 2214 resultp = Zalloc(sizeof (md_mn_result_t)); 2215 resultp->mmr_comm_state = MDMNE_SUSPENDED; 2216 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp); 2217 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2218 free_result(resultp); 2219 commd_debug(MD_MMV_SEND, 2220 "send: class suspended (%d, 
0x%llx-%d), set=%d, " 2221 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), 2222 setno, class, msg->msg_type); 2223 return (NULL); 2224 } 2225 mutex_unlock(&mdmn_busy_mutex[setno]); 2226 2227 /* is this rpc request coming from the local node? */ 2228 if (check_license(rqstp, 0) == FALSE) { 2229 svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg); 2230 commd_debug(MD_MMV_SEND, 2231 "send: check licence fail(%d, 0x%llx-%d), set=%d, " 2232 "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid), 2233 setno, class, msg->msg_type); 2234 return (NULL); 2235 } 2236 2237 2238 /* 2239 * We allocate a structure that can take two pointers in order to pass 2240 * both the message and the transp into thread_create. 2241 * The free for this alloc is done in mdmn_send_to_work() 2242 */ 2243 matp = Malloc(sizeof (md_mn_msg_and_transp_t)); 2244 matp->mat_msg = msg; 2245 matp->mat_transp = transp; 2246 2247 /* 2248 * create a thread here that calls work on the master. 2249 * If we are already on the master, this would block if running 2250 * in the same context. (our service is single threaded)( 2251 * Make it a detached thread because it will not communicate with 2252 * anybody thru thr_* mechanisms 2253 */ 2254 thr_create(NULL, 0, mdmn_send_to_work, (void *) matp, THR_DETACHED, 2255 NULL); 2256 2257 commd_debug(MD_MMV_SEND, "send: done (%d, 0x%llx-%d)\n", 2258 MSGID_ELEMS(msg->msg_msgid)); 2259 /* 2260 * We return here without sending results. This will be done by 2261 * mdmn_wakeup_initiator_svc_1() as soon as the results are available. 2262 * Until then the calling send_message will be blocked, while we 2263 * are able to take calls. 
2264 */ 2265 2266 return (NULL); 2267 } 2268 2269 /* ARGSUSED */ 2270 int * 2271 mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp) 2272 { 2273 int err; 2274 set_t setno; 2275 thread_t tid; 2276 int *retval; 2277 md_mn_msg_t *msg; 2278 md_mn_msgclass_t class; 2279 2280 retval = Malloc(sizeof (int)); 2281 2282 /* If we are in the abort state, we error out immediately */ 2283 if (md_commd_global_state & MD_CGS_ABORTED) { 2284 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 2285 *retval = MDMNE_ABORT; 2286 return (retval); 2287 } 2288 2289 msg = copy_msg(omsg, NULL); 2290 xdr_free(xdr_md_mn_msg_t, (caddr_t)omsg); 2291 2292 /* 2293 * Is this message type blocked? 2294 * If so we return MDMNE_CLASS_LOCKED, immediately. 2295 * This check is performed on master and slave. 2296 */ 2297 if (msgtype_lock_state[msg->msg_type] == MMTL_LOCK) { 2298 *retval = MDMNE_CLASS_LOCKED; 2299 return (retval); 2300 } 2301 2302 /* check if the global initialization is done */ 2303 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2304 global_init(); 2305 } 2306 2307 class = mdmn_get_message_class(msg->msg_type); 2308 setno = msg->msg_setno; 2309 2310 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2311 /* Can only use the appropriate mutexes if they are inited */ 2312 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2313 rw_wrlock(&set_desc_rwlock[setno]); 2314 rw_wrlock(&client_rwlock[setno]); 2315 err = mdmn_init_set(setno, MDMN_SET_READY); 2316 rw_unlock(&client_rwlock[setno]); 2317 rw_unlock(&set_desc_rwlock[setno]); 2318 } else { 2319 err = mdmn_init_set(setno, MDMN_SET_READY); 2320 } 2321 2322 if (err) { 2323 *retval = MDMNE_CANNOT_CONNECT; 2324 free_msg(msg); 2325 return (retval); 2326 } 2327 } 2328 2329 /* is this rpc request coming from a licensed node? 
*/ 2330 if (check_license(rqstp, msg->msg_sender) == FALSE) { 2331 free_msg(msg); 2332 *retval = MDMNE_RPC_FAIL; 2333 return (retval); 2334 } 2335 2336 commd_debug(MD_MMV_WORK, 2337 "work: received (%d, 0x%llx-%d), set=%d, class=%d, type=%d, " 2338 "flags=0x%x\n", 2339 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type, 2340 msg->msg_flags); 2341 2342 /* Check for various CLASS0 message types */ 2343 if (msg->msg_type == MD_MN_MSG_VERBOSITY) { 2344 md_mn_verbose_t *d; 2345 2346 d = (md_mn_verbose_t *)((void *)(msg->msg_event_data)); 2347 /* for now we ignore set / class in md_mn_verbose_t */ 2348 md_commd_global_verb = d->mmv_what; 2349 /* everytime the bitmask is set, we reset the timer */ 2350 __savetime = gethrtime(); 2351 } 2352 2353 mutex_lock(&mdmn_busy_mutex[setno]); 2354 2355 /* check if class is locked via a call to mdmn_comm_lock_svc_1 */ 2356 if (mdmn_is_class_locked(setno, class) == TRUE) { 2357 mutex_unlock(&mdmn_busy_mutex[setno]); 2358 *retval = MDMNE_CLASS_LOCKED; 2359 free_msg(msg); 2360 return (retval); 2361 } 2362 mutex_unlock(&mdmn_busy_mutex[setno]); 2363 2364 /* Check if the class is busy right now. Do it only on the master */ 2365 rw_rdlock(&set_desc_rwlock[setno]); 2366 if (set_descriptor[setno]->sd_mn_am_i_master) { 2367 rw_unlock(&set_desc_rwlock[setno]); 2368 /* 2369 * If the class is currently suspended, don't accept new 2370 * messages, unless they are flagged with an override bit. 
2371 */ 2372 mutex_lock(&mdmn_busy_mutex[setno]); 2373 if ((mdmn_is_class_suspended(setno, class) == TRUE) && 2374 ((msg->msg_flags & MD_MSGF_OVERRIDE_SUSPEND) == 0)) { 2375 mutex_unlock(&mdmn_busy_mutex[setno]); 2376 *retval = MDMNE_SUSPENDED; 2377 commd_debug(MD_MMV_SEND, 2378 "send: set %d is suspended\n", setno); 2379 free_msg(msg); 2380 return (retval); 2381 } 2382 if (mdmn_mark_class_busy(setno, class) == FALSE) { 2383 mutex_unlock(&mdmn_busy_mutex[setno]); 2384 *retval = MDMNE_CLASS_BUSY; 2385 free_msg(msg); 2386 return (retval); 2387 } 2388 mutex_unlock(&mdmn_busy_mutex[setno]); 2389 /* 2390 * Because the real processing of the message takes time we 2391 * create a thread for it. So the master thread can continue 2392 * to run and accept further messages. 2393 */ 2394 *retval = thr_create(NULL, 0, 2395 (void *(*)(void *))mdmn_master_process_msg, (void *)msg, 2396 THR_DETACHED|THR_SUSPENDED, &tid); 2397 } else { 2398 rw_unlock(&set_desc_rwlock[setno]); 2399 *retval = thr_create(NULL, 0, 2400 (void *(*)(void *)) mdmn_slave_process_msg, (void *)msg, 2401 THR_DETACHED|THR_SUSPENDED, &tid); 2402 } 2403 2404 if (*retval != 0) { 2405 *retval = MDMNE_THR_CREATE_FAIL; 2406 free_msg(msg); 2407 return (retval); 2408 } 2409 2410 /* Now run the new thread */ 2411 thr_continue(tid); 2412 2413 commd_debug(MD_MMV_WORK, 2414 "work: done (%d, 0x%llx-%d), set=%d, class=%d, type=%d\n", 2415 MSGID_ELEMS(msg->msg_msgid), setno, class, msg->msg_type); 2416 2417 *retval = MDMNE_ACK; /* this means success */ 2418 return (retval); 2419 } 2420 2421 /* ARGSUSED */ 2422 int * 2423 mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp) 2424 { 2425 2426 int *retval; 2427 int err; 2428 set_t setno; 2429 mutex_t *mx; /* protection of initiator_table */ 2430 SVCXPRT *transp; 2431 md_mn_msgid_t initiator_table_id; 2432 md_mn_msgclass_t class; 2433 2434 retval = Malloc(sizeof (int)); 2435 2436 /* check if the global initialization is done */ 2437 if ((md_commd_global_state & 
MD_CGS_INITED) == 0) { 2438 global_init(); 2439 } 2440 2441 setno = res->mmr_setno; 2442 2443 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2444 /* set not ready means we just crashed are restarted now */ 2445 /* Can only use the appropriate mutexes if they are inited */ 2446 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2447 rw_wrlock(&set_desc_rwlock[setno]); 2448 rw_wrlock(&client_rwlock[setno]); 2449 err = mdmn_init_set(setno, MDMN_SET_READY); 2450 rw_unlock(&client_rwlock[setno]); 2451 rw_unlock(&set_desc_rwlock[setno]); 2452 } else { 2453 err = mdmn_init_set(setno, MDMN_SET_READY); 2454 } 2455 2456 if (err) { 2457 *retval = MDMNE_CANNOT_CONNECT; 2458 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2459 return (retval); 2460 } 2461 } 2462 2463 /* is this rpc request coming from a licensed node? */ 2464 if (check_license(rqstp, res->mmr_sender) == FALSE) { 2465 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2466 *retval = MDMNE_RPC_FAIL; 2467 return (retval); 2468 } 2469 2470 2471 class = mdmn_get_message_class(res->mmr_msgtype); 2472 mx = mdmn_get_initiator_table_mx(setno, class); 2473 2474 commd_debug(MD_MMV_WAKE_I, 2475 "wake_ini: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d\n", 2476 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype); 2477 2478 mutex_lock(mx); 2479 2480 /* 2481 * Search the initiator wakeup table. 
2482 * If we find an entry here (which should always be true) 2483 * we are on the initiating node and we wakeup the original 2484 * local rpc call 2485 */ 2486 mdmn_get_initiator_table_id(setno, class, &initiator_table_id); 2487 2488 if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) { 2489 transp = mdmn_get_initiator_table_transp(setno, class); 2490 mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res); 2491 mdmn_unregister_initiator_table(setno, class); 2492 *retval = MDMNE_ACK; 2493 2494 commd_debug(MD_MMV_WAKE_I, 2495 "wake_ini: replied (%d, 0x%llx-%d)\n", 2496 MSGID_ELEMS(res->mmr_msgid)); 2497 } else { 2498 commd_debug(MD_MMV_WAKE_I, 2499 "wakeup initiator: unsolicited message (%d, 0x%llx-%d)\n", 2500 MSGID_ELEMS(res->mmr_msgid)); 2501 *retval = MDMNE_NO_WAKEUP_ENTRY; 2502 } 2503 mutex_unlock(mx); 2504 /* less work for check_timeouts */ 2505 mutex_lock(&check_timeout_mutex); 2506 if (messages_on_their_way == 0) { 2507 commd_debug(MD_MMV_WAKE_I, 2508 "Oops, messages_on_their_way < 0 (%d, 0x%llx-%d)\n", 2509 MSGID_ELEMS(res->mmr_msgid)); 2510 } else { 2511 messages_on_their_way--; 2512 } 2513 mutex_unlock(&check_timeout_mutex); 2514 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2515 2516 return (retval); 2517 } 2518 2519 2520 /* 2521 * res must be free'd by the thread we wake up 2522 */ 2523 /* ARGSUSED */ 2524 int * 2525 mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp) 2526 { 2527 2528 int *retval; 2529 int err; 2530 set_t setno; 2531 cond_t *cv; 2532 mutex_t *mx; 2533 md_mn_msgid_t master_table_id; 2534 md_mn_nodeid_t sender; 2535 md_mn_result_t *res; 2536 md_mn_msgclass_t class; 2537 2538 retval = Malloc(sizeof (int)); 2539 2540 /* check if the global initialization is done */ 2541 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2542 global_init(); 2543 } 2544 2545 /* Need to copy the results here, as they are static for RPC */ 2546 res = copy_result(ores); 2547 xdr_free(xdr_md_mn_result_t, (caddr_t)ores); 2548 2549 class 
= mdmn_get_message_class(res->mmr_msgtype); 2550 setno = res->mmr_setno; 2551 2552 if (md_mn_set_inited[setno] != MDMN_SET_READY) { 2553 /* set not ready means we just crashed are restarted now */ 2554 /* Can only use the appropriate mutexes if they are inited */ 2555 if (md_mn_set_inited[setno] & MDMN_SET_MUTEXES) { 2556 rw_wrlock(&set_desc_rwlock[setno]); 2557 rw_wrlock(&client_rwlock[setno]); 2558 err = mdmn_init_set(setno, MDMN_SET_READY); 2559 rw_unlock(&client_rwlock[setno]); 2560 rw_unlock(&set_desc_rwlock[setno]); 2561 } else { 2562 err = mdmn_init_set(setno, MDMN_SET_READY); 2563 } 2564 2565 if (err) { 2566 *retval = MDMNE_CANNOT_CONNECT; 2567 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2568 return (retval); 2569 } 2570 } 2571 2572 /* is this rpc request coming from a licensed node? */ 2573 if (check_license(rqstp, res->mmr_sender) == FALSE) { 2574 *retval = MDMNE_RPC_FAIL; 2575 xdr_free(xdr_md_mn_result_t, (caddr_t)res); 2576 return (retval); 2577 } 2578 2579 2580 commd_debug(MD_MMV_WAKE_M, 2581 "wake_mas: received (%d, 0x%llx-%d) set=%d, class=%d, type=%d " 2582 "from %d\n", 2583 MSGID_ELEMS(res->mmr_msgid), setno, class, res->mmr_msgtype, 2584 res->mmr_sender); 2585 /* 2586 * The mutex and cv are needed for waking up the thread 2587 * sleeping in mdmn_master_process_msg() 2588 */ 2589 mx = mdmn_get_master_table_mx(setno, class); 2590 cv = mdmn_get_master_table_cv(setno, class); 2591 2592 /* 2593 * lookup the master wakeup table 2594 * If we find our message, we are on the master and 2595 * called by a slave that finished processing a message. 2596 * We store the results in the appropriate slot and 2597 * wakeup the thread (mdmn_master_process_msg()) waiting for them. 
2598 */ 2599 mutex_lock(mx); 2600 mdmn_get_master_table_id(setno, class, &master_table_id); 2601 sender = mdmn_get_master_table_addr(setno, class); 2602 2603 if (MSGID_CMP(&(master_table_id), &(res->mmr_msgid))) { 2604 if (sender == res->mmr_sender) { 2605 mdmn_set_master_table_res(setno, class, res); 2606 cond_signal(cv); 2607 *retval = MDMNE_ACK; 2608 } else { 2609 /* id is correct but wrong sender (I smell a timeout) */ 2610 commd_debug(MD_MMV_WAKE_M, 2611 "wakeup master got unsolicited message: " 2612 "(%d, 0x%llx-%d) from %d\n", 2613 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender); 2614 free_result(res); 2615 *retval = MDMNE_TIMEOUT; 2616 } 2617 } else { 2618 /* id is wrong, smells like a very late timeout */ 2619 commd_debug(MD_MMV_WAKE_M, 2620 "wakeup master got unsolicited message: " 2621 "(%d, 0x%llx-%d) from %d, expected (%d, 0x%llx-%d)\n", 2622 MSGID_ELEMS(res->mmr_msgid), res->mmr_sender, 2623 MSGID_ELEMS(master_table_id)); 2624 free_result(res); 2625 *retval = MDMNE_NO_WAKEUP_ENTRY; 2626 } 2627 2628 mutex_unlock(mx); 2629 2630 return (retval); 2631 } 2632 2633 /* 2634 * Lock a set/class combination. 2635 * This is mainly done for debug purpose. 2636 * This set/class combination immediately is blocked, 2637 * even in the middle of sending messages to multiple slaves. 2638 * This remains until the user issues a mdmn_comm_unlock_svc_1 for the same 2639 * set/class combination. 2640 * 2641 * Special messages of class MD_MSG_CLASS0 can never be locked. 2642 * e.g. 
MD_MN_MSG_VERBOSITY, MD_MN_MSG_ABORT 2643 * 2644 * That means, if MD_MSG_CLASS0 is specified, we lock all classes from 2645 * >= MD_MSG_CLASS1 to < MD_MN_NCLASSES 2646 * 2647 * set must be between 1 and MD_MAXSETS 2648 * class can be: 2649 * MD_MSG_CLASS0 which means all other classes in this case 2650 * or one specific class (< MD_MN_NCLASSES) 2651 * 2652 * Returns: 2653 * MDMNE_ACK on sucess (locking a locked class is Ok) 2654 * MDMNE_EINVAL if a parameter is out of range 2655 */ 2656 2657 /* ARGSUSED */ 2658 int * 2659 mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2660 { 2661 int *retval; 2662 set_t setno = msc->msc_set; 2663 md_mn_msgclass_t class = msc->msc_class; 2664 2665 retval = Malloc(sizeof (int)); 2666 2667 /* check if the global initialization is done */ 2668 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2669 global_init(); 2670 } 2671 2672 /* is this rpc request coming from the local node ? */ 2673 if (check_license(rqstp, 0) == FALSE) { 2674 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2675 *retval = MDMNE_RPC_FAIL; 2676 return (retval); 2677 } 2678 2679 /* Perform some range checking */ 2680 if ((setno == 0) || (setno >= MD_MAXSETS) || 2681 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) { 2682 *retval = MDMNE_EINVAL; 2683 return (retval); 2684 } 2685 2686 commd_debug(MD_MMV_MISC, "lock: set=%d, class=%d\n", setno, class); 2687 mutex_lock(&mdmn_busy_mutex[setno]); 2688 if (class != MD_MSG_CLASS0) { 2689 mdmn_mark_class_locked(setno, class); 2690 } else { 2691 /* MD_MSG_CLASS0 is used as a wild card for all classes */ 2692 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 2693 mdmn_mark_class_locked(setno, class); 2694 } 2695 } 2696 mutex_unlock(&mdmn_busy_mutex[setno]); 2697 2698 *retval = MDMNE_ACK; 2699 return (retval); 2700 } 2701 2702 /* 2703 * Unlock a set/class combination. 
2704 * set must be between 1 and MD_MAXSETS 2705 * class can be: 2706 * MD_MSG_CLASS0 which means all other classes in this case (like above) 2707 * or one specific class (< MD_MN_NCLASSES) 2708 * 2709 * Returns: 2710 * MDMNE_ACK on sucess (unlocking an unlocked class is Ok) 2711 * MDMNE_EINVAL if a parameter is out of range 2712 */ 2713 /* ARGSUSED */ 2714 int * 2715 mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2716 { 2717 int *retval; 2718 set_t setno = msc->msc_set; 2719 md_mn_msgclass_t class = msc->msc_class; 2720 2721 retval = Malloc(sizeof (int)); 2722 2723 /* check if the global initialization is done */ 2724 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2725 global_init(); 2726 } 2727 2728 /* is this rpc request coming from the local node ? */ 2729 if (check_license(rqstp, 0) == FALSE) { 2730 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2731 *retval = MDMNE_RPC_FAIL; 2732 return (retval); 2733 } 2734 2735 /* Perform some range checking */ 2736 if ((setno == 0) || (setno >= MD_MAXSETS) || 2737 (class < MD_MSG_CLASS0) || (class >= MD_MN_NCLASSES)) { 2738 *retval = MDMNE_EINVAL; 2739 return (retval); 2740 } 2741 commd_debug(MD_MMV_MISC, "unlock: set=%d, class=%d\n", setno, class); 2742 2743 mutex_lock(&mdmn_busy_mutex[setno]); 2744 if (class != MD_MSG_CLASS0) { 2745 mdmn_mark_class_unlocked(setno, class); 2746 } else { 2747 /* MD_MSG_CLASS0 is used as a wild card for all classes */ 2748 for (class = MD_MSG_CLASS1; class < MD_MN_NCLASSES; class++) { 2749 mdmn_mark_class_unlocked(setno, class); 2750 } 2751 } 2752 mutex_unlock(&mdmn_busy_mutex[setno]); 2753 2754 *retval = MDMNE_ACK; 2755 return (retval); 2756 } 2757 2758 /* 2759 * mdmn_comm_suspend_svc_1(setno, class) 2760 * 2761 * Drain all outstanding messages for a given set/class combination 2762 * and don't allow new messages to be processed. 2763 * 2764 * Special messages of class MD_MSG_CLASS0 can never be locked. 2765 * e.g. 
MD_MN_MSG_VERBOSITY
 *
 * 1 <= setno < MD_MAXSETS	or setno == MD_COMM_ALL_SETS
 * 1 <= class < MD_MN_NCLASSES	or class == MD_COMM_ALL_CLASSES
 *
 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this
 * one class as being suspended.
 * If messages for this class are currently on their way,
 * MDMNE_SET_NOT_DRAINED is returned. Otherwise MDMNE_ACK is returned.
 *
 * If class _is_ MD_COMM_ALL_CLASSES we drain all classes of this set.
 * Messages must be generated in ascending order.
 * This means a message cannot create submessages with the same or a
 * lower class.
 * Draining messages must go from 1 to NCLASSES in order to ensure we don't
 * generate a hanging situation here.
 * We mark class 1 as being suspended.
 * If the class is not busy, we proceed with class 2
 * and so on.
 * If a class *is* busy, we cannot continue here, but return
 * MDMNE_SET_NOT_DRAINED.
 * We expect the caller to hold on for some seconds and try again.
 * When the message that held the class busy is done in
 * mdmn_master_process_msg(), mdmn_mark_class_unbusy() is called.
 * There it is checked if the class is about to drain.
 * In that case it tries to drain all higher classes there.
 *
 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets.
 * In that case we return MDMNE_SET_NOT_DRAINED if not all sets are
 * completely drained.
2794 * 2795 * Returns: 2796 * MDMNE_ACK on sucess (set is drained, no outstanding messages) 2797 * MDMNE_SET_NOT_DRAINED if drain process is started, but there are 2798 * still outstanding messages for this set(s) 2799 * MDMNE_EINVAL if setno is out of range 2800 * MDMNE_NOT_JOINED if the set is not yet initialized on this node 2801 */ 2802 2803 /* ARGSUSED */ 2804 int * 2805 mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2806 { 2807 int *retval; 2808 int failure = 0; 2809 set_t startset, endset; 2810 set_t setno = msc->msc_set; 2811 md_mn_msgclass_t oclass = msc->msc_class; 2812 #ifdef NOT_YET_NEEDED 2813 uint_t flags = msc->msc_flags; 2814 #endif /* NOT_YET_NEEDED */ 2815 md_mn_msgclass_t class; 2816 2817 retval = Malloc(sizeof (int)); 2818 2819 /* check if the global initialization is done */ 2820 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2821 global_init(); 2822 } 2823 2824 /* is this rpc request coming from the local node ? */ 2825 if (check_license(rqstp, 0) == FALSE) { 2826 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2827 *retval = MDMNE_RPC_FAIL; 2828 return (retval); 2829 } 2830 2831 commd_debug(MD_MMV_MISC, "suspend: called for set=%d class=%d\n", 2832 setno, oclass); 2833 2834 /* Perform some range checking */ 2835 if (setno >= MD_MAXSETS) { 2836 *retval = MDMNE_EINVAL; 2837 commd_debug(MD_MMV_MISC, "suspend: returning MDMNE_EINVAL\n"); 2838 return (retval); 2839 } 2840 2841 /* setno == MD_COMM_ALL_SETS means: we walk thru all possible sets. 
*/ 2842 if (setno == MD_COMM_ALL_SETS) { 2843 startset = 1; 2844 endset = MD_MAXSETS - 1; 2845 } else { 2846 startset = setno; 2847 endset = setno; 2848 } 2849 2850 for (setno = startset; setno <= endset; setno++) { 2851 /* Here we need the mutexes for the set to be setup */ 2852 if (md_mn_set_inited[setno] != MDMN_SET_MUTEXES) { 2853 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES); 2854 } 2855 2856 mutex_lock(&mdmn_busy_mutex[setno]); 2857 /* shall we drain all classes of this set? */ 2858 if (oclass == MD_COMM_ALL_CLASSES) { 2859 for (class = 1; class < MD_MN_NCLASSES; class ++) { 2860 commd_debug(MD_MMV_MISC, 2861 "suspend: suspending set %d, class %d\n", 2862 setno, class); 2863 *retval = mdmn_mark_class_suspended(setno, 2864 class, MDMN_SUSPEND_ALL); 2865 if (*retval == MDMNE_SET_NOT_DRAINED) { 2866 failure++; 2867 } 2868 } 2869 } else { 2870 /* only drain one specific class */ 2871 commd_debug(MD_MMV_MISC, 2872 "suspend: suspending set=%d class=%d\n", 2873 setno, oclass); 2874 *retval = mdmn_mark_class_suspended(setno, oclass, 2875 MDMN_SUSPEND_1); 2876 if (*retval == MDMNE_SET_NOT_DRAINED) { 2877 failure++; 2878 } 2879 } 2880 mutex_unlock(&mdmn_busy_mutex[setno]); 2881 } 2882 /* If one or more sets are not entirely drained, failure is non-zero */ 2883 if (failure != 0) { 2884 *retval = MDMNE_SET_NOT_DRAINED; 2885 commd_debug(MD_MMV_MISC, 2886 "suspend: returning MDMNE_SET_NOT_DRAINED\n"); 2887 } else { 2888 *retval = MDMNE_ACK; 2889 } 2890 2891 return (retval); 2892 } 2893 2894 /* 2895 * mdmn_comm_resume_svc_1(setno, class) 2896 * 2897 * Resume processing messages for a given set. 2898 * This incorporates the repeal of a previous suspend operation. 2899 * 2900 * 1 <= setno < MD_MAXSETS or setno == MD_COMM_ALL_SETS 2901 * 1 <= class < MD_MN_NCLASSES or class == MD_COMM_ALL_CLASSES 2902 * 2903 * If class _is_not_ MD_COMM_ALL_CLASSES, then we simply mark this 2904 * one class as being resumed. 
2905 * 2906 * If class _is_ MD_COMM_ALL_CLASSES we resume all classes of this set. 2907 * 2908 * If setno is MD_COMM_ALL_SETS then we perform this on all possible sets. 2909 * 2910 * If both setno is MD_COMM_ALL_SETS and class is MD_COMM_ALL_CLASSES we also 2911 * reset any ABORT flag from the global state. 2912 * 2913 * Returns: 2914 * MDMNE_ACK on sucess (resuming an unlocked set is Ok) 2915 * MDMNE_EINVAL if setno is out of range 2916 * MDMNE_NOT_JOINED if the set is not yet initialized on this node 2917 */ 2918 /* ARGSUSED */ 2919 int * 2920 mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp) 2921 { 2922 int *retval; 2923 set_t startset, endset; 2924 set_t setno = msc->msc_set; 2925 md_mn_msgclass_t oclass = msc->msc_class; 2926 uint_t flags = msc->msc_flags; 2927 md_mn_msgclass_t class; 2928 2929 retval = Malloc(sizeof (int)); 2930 2931 /* check if the global initialization is done */ 2932 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 2933 global_init(); 2934 } 2935 2936 /* is this rpc request coming from the local node ? 
*/ 2937 if (check_license(rqstp, 0) == FALSE) { 2938 xdr_free(xdr_md_mn_set_and_class_t, (caddr_t)msc); 2939 *retval = MDMNE_RPC_FAIL; 2940 return (retval); 2941 } 2942 2943 commd_debug(MD_MMV_MISC, "resume: called for set=%d class=%d\n", 2944 setno, oclass); 2945 2946 /* Perform some range checking */ 2947 if (setno > MD_MAXSETS) { 2948 *retval = MDMNE_EINVAL; 2949 return (retval); 2950 } 2951 2952 if (setno == MD_COMM_ALL_SETS) { 2953 startset = 1; 2954 endset = MD_MAXSETS - 1; 2955 if (oclass == MD_COMM_ALL_CLASSES) { 2956 /* This is the point where we "unabort" the commd */ 2957 commd_debug(MD_MMV_MISC, "resume: resetting ABORT\n"); 2958 md_commd_global_state &= ~MD_CGS_ABORTED; 2959 } 2960 } else { 2961 startset = setno; 2962 endset = setno; 2963 } 2964 2965 for (setno = startset; setno <= endset; setno++) { 2966 2967 /* Here we need the mutexes for the set to be setup */ 2968 if ((md_mn_set_inited[setno] & MDMN_SET_MUTEXES) == 0) { 2969 (void) mdmn_init_set(setno, MDMN_SET_MUTEXES); 2970 } 2971 2972 mutex_lock(&mdmn_busy_mutex[setno]); 2973 2974 if (oclass == MD_COMM_ALL_CLASSES) { 2975 int end_class = 1; 2976 /* 2977 * When SUSPENDing all classes, we go 2978 * from 1 to MD_MN_NCLASSES-1 2979 * The correct reverse action is RESUMing 2980 * from MD_MN_NCLASSES-1 to 1 (or 2) 2981 */ 2982 2983 if (flags & MD_MSCF_DONT_RESUME_CLASS1) { 2984 end_class = 2; 2985 } 2986 2987 /* 2988 * Then mark all classes of this set as no longer 2989 * suspended. This supersedes any previous suspend(1) 2990 * calls and resumes the set entirely. 2991 */ 2992 for (class = MD_MN_NCLASSES - 1; class >= end_class; 2993 class --) { 2994 commd_debug(MD_MMV_MISC, 2995 "resume: resuming set=%d class=%d\n", 2996 setno, class); 2997 mdmn_mark_class_resumed(setno, class, 2998 (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1)); 2999 } 3000 } else { 3001 /* 3002 * In this case only one class is marked as not 3003 * suspended. 
If a suspend(all) is currently active for 3004 * this set, this class will still be suspended. 3005 * That state will be cleared by a suspend(all) 3006 * (see above) 3007 */ 3008 commd_debug(MD_MMV_MISC, 3009 "resume: resuming set=%d class=%d\n", 3010 setno, oclass); 3011 mdmn_mark_class_resumed(setno, oclass, MDMN_SUSPEND_1); 3012 } 3013 3014 mutex_unlock(&mdmn_busy_mutex[setno]); 3015 } 3016 3017 *retval = MDMNE_ACK; 3018 return (retval); 3019 } 3020 /* ARGSUSED */ 3021 int * 3022 mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp) 3023 { 3024 int *retval; 3025 md_mnnode_desc *node; 3026 set_t setno = *setnop; 3027 3028 retval = Malloc(sizeof (int)); 3029 3030 /* check if the global initialization is done */ 3031 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 3032 global_init(); 3033 } 3034 3035 /* is this rpc request coming from the local node ? */ 3036 if (check_license(rqstp, 0) == FALSE) { 3037 xdr_free(xdr_set_t, (caddr_t)setnop); 3038 *retval = MDMNE_RPC_FAIL; 3039 return (retval); 3040 } 3041 3042 commd_debug(MD_MMV_MISC, "reinit: set=%d\n", setno); 3043 3044 rw_rdlock(&set_desc_rwlock[setno]); 3045 /* 3046 * We assume, that all messages have been suspended previously. 3047 * 3048 * As we are modifying lots of clients here we grab the client_rwlock 3049 * in writer mode. This ensures, no new messages come in. 
3050 */ 3051 rw_wrlock(&client_rwlock[setno]); 3052 /* This set is no longer initialized */ 3053 3054 if ((set_descriptor[setno] != NULL) && 3055 (md_mn_set_inited[setno] & MDMN_SET_NODES)) { 3056 /* destroy all rpc clients from this set */ 3057 for (node = set_descriptor[setno]->sd_nodelist; node; 3058 node = node->nd_next) { 3059 mdmn_clnt_destroy(client[setno][node->nd_nodeid]); 3060 if (client[setno][node->nd_nodeid] != (CLIENT *)NULL) { 3061 client[setno][node->nd_nodeid] = (CLIENT *)NULL; 3062 } 3063 } 3064 md_mn_set_inited[setno] &= ~MDMN_SET_NODES; 3065 } 3066 3067 commd_debug(MD_MMV_MISC, "reinit: done init_set(%d)\n", setno); 3068 3069 rw_unlock(&client_rwlock[setno]); 3070 rw_unlock(&set_desc_rwlock[setno]); 3071 *retval = MDMNE_ACK; 3072 return (retval); 3073 } 3074 3075 /* 3076 * This is just an interface for testing purpose. 3077 * Here we can disable single message types. 3078 * If we block a message type, this is valid for all MN sets. 3079 * If a message arrives later, and it's message type is blocked, it will 3080 * be returned immediately with MDMNE_CLASS_LOCKED, which causes the sender to 3081 * resend this message over and over again. 3082 */ 3083 3084 /* ARGSUSED */ 3085 int * 3086 mdmn_comm_msglock_svc_1(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp) 3087 { 3088 int *retval; 3089 md_mn_msgtype_t type = mmtl->mmtl_type; 3090 uint_t lock = mmtl->mmtl_lock; 3091 3092 retval = Malloc(sizeof (int)); 3093 3094 /* check if the global initialization is done */ 3095 if ((md_commd_global_state & MD_CGS_INITED) == 0) { 3096 global_init(); 3097 } 3098 3099 /* is this rpc request coming from the local node ? 
*/ 3100 if (check_license(rqstp, 0) == FALSE) { 3101 xdr_free(xdr_md_mn_type_and_lock_t, (caddr_t)mmtl); 3102 *retval = MDMNE_RPC_FAIL; 3103 return (retval); 3104 } 3105 3106 /* Perform some range checking */ 3107 if ((type == 0) || (type >= MD_MN_NMESSAGES)) { 3108 *retval = MDMNE_EINVAL; 3109 return (retval); 3110 } 3111 3112 commd_debug(MD_MMV_MISC, "msglock: type=%d, lock=%d\n", type, lock); 3113 msgtype_lock_state[type] = lock; 3114 3115 *retval = MDMNE_ACK; 3116 return (retval); 3117 } 3118