xref: /titanic_41/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c (revision 80148899834a4078a2bd348504aa2d6de9752837)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <unistd.h>
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <netinet/in.h>
31 #include <arpa/inet.h>
32 #include <thread.h>
33 #include "meta.h"
34 #include "mdmn_subr.h"
35 
/*
 * Defined outside this file. It is called below with MDMN_SET_MCT when the
 * completion table of a set is needed; a non-zero return is treated as
 * failure there.
 */
extern int mdmn_init_set(set_t setno, int todo);

/*
 * State word per set/class. The routines below set, clear and test the
 * MDMN_BUSY, MDMN_LOCKED, MDMN_SUSPEND_ALL and MDMN_SUSPEND_1 bits in it.
 */
uint_t mdmn_busy[MD_MAXSETS][MD_MN_NCLASSES];
/* per set mutex; the mdmn_busy routines expect it to be held by the caller */
mutex_t	mdmn_busy_mutex[MD_MAXSETS];
/* per set condition variable; signalled when a class is marked unbusy */
cond_t	mdmn_busy_cv[MD_MAXSETS];


/* the wakeup table for the initiator's side */
mdmn_wti_t mdmn_initiator_table[MD_MAXSETS][MD_MN_NCLASSES];

/* the wakeup table for the master */
mdmn_wtm_t mdmn_master_table[MD_MAXSETS][MD_MN_NCLASSES];

/* List of licensed ip addresses, indexed by node ID */
licensed_ip_t   licensed_nodes[NNODES];

/*
 * speed up the search for licensed ip addresses:
 * highest node ID that add_license() has entered so far
 */
md_mn_nodeid_t maxlicnodes = 0; /* 0 is not a valid node ID */
54 
55 /*
56  * Check if a given set/class combination is currently in use
57  * If in use, returns TRUE
58  * Otherwise returns FALSE
59  *
60  * Must be called with mdmn_busy_mutex held
61  */
62 bool_t
mdmn_is_class_busy(set_t setno,md_mn_msgclass_t class)63 mdmn_is_class_busy(set_t setno, md_mn_msgclass_t class)
64 {
65 	if (mdmn_busy[setno][class] & MDMN_BUSY) {
66 		return (TRUE);
67 	} else {
68 		return (FALSE);
69 	}
70 }
71 
72 /*
73  * Mark a given set/class combination as currently in use
74  * If the class was already in use, returns FALSE
75  * Otherwise returns TRUE
76  *
77  * So mdmn_mark_class_busy can be used like
78  * if (mdmn_mark_class_busy(setno, class) == FALSE)
79  * 	failure;
80  * else
81  *	success;
82  *
83  * Must be called with mdmn_busy_mutex held
84  */
85 bool_t
mdmn_mark_class_busy(set_t setno,md_mn_msgclass_t class)86 mdmn_mark_class_busy(set_t setno, md_mn_msgclass_t class)
87 {
88 	if (mdmn_busy[setno][class] & MDMN_BUSY) {
89 		return (FALSE);
90 	} else {
91 		mdmn_busy[setno][class] |= MDMN_BUSY;
92 		commd_debug(MD_MMV_MISC, "busy: set=%d, class=%d\n",
93 		    setno, class);
94 		return (TRUE);
95 	}
96 }
97 
98 /*
99  * Mark a given set/class combination as currently available
100  * Always succeeds, thus void.
101  *
102  * If this class is marked MDMN_SUSPEND_ALL, we are in the middle of
103  * draining all classes of this set.
104  * We have to mark class+1 as MDMN_SUSPEND_ALL too.
105  * If class+2 wasn't busy, we proceed with class+2, and so on
106  * If any class is busy, we return.
107  * Then the drain process will be continued by the mdmn_mark_class_unbusy() of
108  * that busy class
109  */
110 void
mdmn_mark_class_unbusy(set_t setno,md_mn_msgclass_t class)111 mdmn_mark_class_unbusy(set_t setno, md_mn_msgclass_t class)
112 {
113 	commd_debug(MD_MMV_MISC, "unbusy: set=%d, class=%d\n", setno, class);
114 	mdmn_busy[setno][class] &= ~MDMN_BUSY;
115 	/* something changed, inform threads waiting for that */
116 	(void) cond_signal(&mdmn_busy_cv[setno]);
117 
118 	if ((mdmn_busy[setno][class] & MDMN_SUSPEND_ALL) == 0) {
119 		return;
120 	}
121 
122 	while (++class < MD_MN_NCLASSES) {
123 		commd_debug(MD_MMV_MISC,
124 		    "unbusy: suspending set=%d, class=%d\n", setno, class);
125 		if (mdmn_mark_class_suspended(setno, class, MDMN_SUSPEND_ALL)
126 		    == MDMNE_SET_NOT_DRAINED) {
127 			break;
128 		}
129 	}
130 
131 }
132 
133 
134 /*
135  * Check if a given set/class combination is locked.
136  */
137 bool_t
mdmn_is_class_locked(set_t setno,md_mn_msgclass_t class)138 mdmn_is_class_locked(set_t setno, md_mn_msgclass_t class)
139 {
140 	if (mdmn_busy[setno][class] & MDMN_LOCKED) {
141 		return (TRUE);
142 	} else {
143 		return (FALSE);
144 	}
145 }
146 
147 /*
148  * Mark a given set/class combination as locked.
149  * No checking is done here, so routine can be void.
150  * Locking a locked set/class is ok.
151  *
152  * Must be called with mdmn_busy_mutex held
153  */
154 void
mdmn_mark_class_locked(set_t setno,md_mn_msgclass_t class)155 mdmn_mark_class_locked(set_t setno, md_mn_msgclass_t class)
156 {
157 	mdmn_busy[setno][class] |= MDMN_LOCKED;
158 }
159 
160 /*
161  * Mark a given set/class combination as unlocked.
162  * No checking is done here, so routine can be void.
163  * Unlocking a unlocked set/class is ok.
164  *
165  * Must be called with mdmn_busy_mutex held
166  */
167 void
mdmn_mark_class_unlocked(set_t setno,md_mn_msgclass_t class)168 mdmn_mark_class_unlocked(set_t setno, md_mn_msgclass_t class)
169 {
170 	mdmn_busy[setno][class] &= ~MDMN_LOCKED;
171 }
172 
173 /*
174  * Suspend a set/class combination
175  *
176  * If called during draining all classes of a set susptype is MDMN_SUSPEND_ALL.
177  * If only one class is about to be drained susptype is MDMN_SUSPEND_1.
178  *
179  * Returns:
180  *	MDMNE_ACK if there are no outstanding messages
181  *	MDMNE_SET_NOT_DRAINED otherwise
182  *
183  * Must be called with mdmn_busy_mutex held for this set.
184  */
185 int
mdmn_mark_class_suspended(set_t setno,md_mn_msgclass_t class,uint_t susptype)186 mdmn_mark_class_suspended(set_t setno, md_mn_msgclass_t class, uint_t susptype)
187 {
188 	/*
189 	 * We use the mdmn_busy array to mark this set is suspended.
190 	 */
191 	mdmn_busy[setno][class] |= susptype;
192 
193 	/*
194 	 * If there are outstanding messages for this set/class we
195 	 * return MDMNE_SET_NOT_DRAINED, otherwise we return MDMNE_ACK
196 	 */
197 	if (mdmn_is_class_busy(setno, class) == TRUE) {
198 		return (MDMNE_SET_NOT_DRAINED);
199 	}
200 	return (MDMNE_ACK);
201 }
202 
203 /*
204  * Resume operation for a set/class combination after it was
205  * previously suspended
206  *
207  * If called from mdmn_comm_resume_svc_1 to resume _one_ specific class
208  * then susptype will be MDMN_SUSPEND_1
209  * Otherwise to resume all classes of one set,
210  * then susptype equals (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1)
211  *
212  * Always succeeds, thus void.
213  *
214  * Must be called with mdmn_busy_mutex held for this set.
215  */
216 void
mdmn_mark_class_resumed(set_t setno,md_mn_msgclass_t class,uint_t susptype)217 mdmn_mark_class_resumed(set_t setno, md_mn_msgclass_t class, uint_t susptype)
218 {
219 	/* simply the reverse operation to mdmn_mark_set_drained() */
220 	mdmn_busy[setno][class] &= ~susptype;
221 }
222 
223 /*
224  * Check if a drain command was issued for this set/class combination.
225  *
226  * Must be called with mdmn_busy_mutex held for this set.
227  */
228 bool_t
mdmn_is_class_suspended(set_t setno,md_mn_msgclass_t class)229 mdmn_is_class_suspended(set_t setno, md_mn_msgclass_t class)
230 {
231 	if (mdmn_busy[setno][class] & (MDMN_SUSPEND_ALL | MDMN_SUSPEND_1)) {
232 		return (TRUE);
233 	} else {
234 		return (FALSE);
235 	}
236 }
237 
238 /*
239  * Put a result into the wakeup table for the master
240  * It's ensured that the msg id from the master_table entry and from
241  * result are matching
242  */
243 void
mdmn_set_master_table_res(set_t setno,md_mn_msgclass_t class,md_mn_result_t * res)244 mdmn_set_master_table_res(set_t setno, md_mn_msgclass_t class,
245 				md_mn_result_t  *res)
246 {
247 	mdmn_master_table[setno][class].wtm_result = res;
248 }
249 void
mdmn_set_master_table_id(set_t setno,md_mn_msgclass_t class,md_mn_msgid_t * id)250 mdmn_set_master_table_id(set_t setno, md_mn_msgclass_t class, md_mn_msgid_t *id)
251 {
252 	MSGID_COPY(id, &(mdmn_master_table[setno][class].wtm_id));
253 }
254 
255 void
mdmn_set_master_table_addr(set_t setno,md_mn_msgclass_t class,md_mn_nodeid_t nid)256 mdmn_set_master_table_addr(set_t setno, md_mn_msgclass_t class,
257     md_mn_nodeid_t nid)
258 {
259 	mdmn_master_table[setno][class].wtm_addr = nid;
260 }
261 
262 
263 md_mn_result_t *
mdmn_get_master_table_res(set_t setno,md_mn_msgclass_t class)264 mdmn_get_master_table_res(set_t setno, md_mn_msgclass_t class)
265 {
266 	return (mdmn_master_table[setno][class].wtm_result);
267 }
268 
269 void
mdmn_get_master_table_id(set_t setno,md_mn_msgclass_t class,md_mn_msgid_t * id)270 mdmn_get_master_table_id(set_t setno, md_mn_msgclass_t class, md_mn_msgid_t *id)
271 {
272 	MSGID_COPY(&(mdmn_master_table[setno][class].wtm_id), id);
273 }
274 
275 cond_t *
mdmn_get_master_table_cv(set_t setno,md_mn_msgclass_t class)276 mdmn_get_master_table_cv(set_t setno, md_mn_msgclass_t class)
277 {
278 	return (&(mdmn_master_table[setno][class].wtm_cv));
279 }
280 
281 mutex_t *
mdmn_get_master_table_mx(set_t setno,md_mn_msgclass_t class)282 mdmn_get_master_table_mx(set_t setno, md_mn_msgclass_t class)
283 {
284 	return (&(mdmn_master_table[setno][class].wtm_mx));
285 }
286 
287 md_mn_nodeid_t
mdmn_get_master_table_addr(set_t setno,md_mn_msgclass_t class)288 mdmn_get_master_table_addr(set_t setno, md_mn_msgclass_t class)
289 {
290 	return (mdmn_master_table[setno][class].wtm_addr);
291 }
292 
293 
294 
295 /* here come the functions dealing with the wakeup table for the initiators */
296 
297 
298 void
mdmn_register_initiator_table(set_t setno,md_mn_msgclass_t class,md_mn_msg_t * msg,SVCXPRT * transp)299 mdmn_register_initiator_table(set_t setno, md_mn_msgclass_t class,
300     md_mn_msg_t *msg, SVCXPRT *transp)
301 {
302 	uint_t nnodes	= set_descriptor[setno]->sd_mn_numnodes;
303 	time_t timeout	= mdmn_get_timeout(msg->msg_type);
304 
305 
306 	MSGID_COPY(&(msg->msg_msgid),
307 	    &(mdmn_initiator_table[setno][class].wti_id));
308 	mdmn_initiator_table[setno][class].wti_transp = transp;
309 	mdmn_initiator_table[setno][class].wti_args = (char *)msg;
310 
311 	/*
312 	 * as the point in time where we want to be guaranteed to be woken up
313 	 * again, we chose the
314 	 * current time + nnodes times the timeout value for the message type
315 	 */
316 	mdmn_initiator_table[setno][class].wti_time =
317 	    time((time_t *)NULL) + (nnodes * timeout);
318 }
319 
320 /*
321  * If the set/class combination is currently busy, return MDMNE_CLASS_BUSY
322  * Otherwise return MDMNE_ACK
323  */
324 int
mdmn_check_initiator_table(set_t setno,md_mn_msgclass_t class)325 mdmn_check_initiator_table(set_t setno, md_mn_msgclass_t class)
326 {
327 	if ((mdmn_initiator_table[setno][class].wti_id.mid_nid == ~0u) &&
328 	    (mdmn_initiator_table[setno][class].wti_transp == (SVCXPRT *)NULL))
329 		return (MDMNE_ACK);
330 	return (MDMNE_CLASS_BUSY);
331 }
332 
333 /*
334  * Remove an entry from the initiator table entirely,
335  * This must be done with mutex held.
336  */
337 void
mdmn_unregister_initiator_table(set_t setno,md_mn_msgclass_t class)338 mdmn_unregister_initiator_table(set_t setno, md_mn_msgclass_t class)
339 {
340 	mdmn_initiator_table[setno][class].wti_id.mid_nid = ~0u;
341 	mdmn_initiator_table[setno][class].wti_id.mid_time = 0LL;
342 	mdmn_initiator_table[setno][class].wti_transp = (SVCXPRT *)NULL;
343 	mdmn_initiator_table[setno][class].wti_args = (char *)0;
344 	mdmn_initiator_table[setno][class].wti_time = (time_t)0;
345 }
346 
347 void
mdmn_get_initiator_table_id(set_t setno,md_mn_msgclass_t class,md_mn_msgid_t * mid)348 mdmn_get_initiator_table_id(set_t setno, md_mn_msgclass_t class,
349 				md_mn_msgid_t *mid)
350 {
351 	MSGID_COPY(&(mdmn_initiator_table[setno][class].wti_id), mid);
352 }
353 
354 SVCXPRT *
mdmn_get_initiator_table_transp(set_t setno,md_mn_msgclass_t class)355 mdmn_get_initiator_table_transp(set_t setno, md_mn_msgclass_t class)
356 {
357 	return (mdmn_initiator_table[setno][class].wti_transp);
358 }
359 
360 char *
mdmn_get_initiator_table_args(set_t setno,md_mn_msgclass_t class)361 mdmn_get_initiator_table_args(set_t setno, md_mn_msgclass_t class)
362 {
363 	return (mdmn_initiator_table[setno][class].wti_args);
364 }
365 
366 mutex_t *
mdmn_get_initiator_table_mx(set_t setno,md_mn_msgclass_t class)367 mdmn_get_initiator_table_mx(set_t setno, md_mn_msgclass_t class)
368 {
369 	return (&(mdmn_initiator_table[setno][class].wti_mx));
370 }
371 
372 time_t
mdmn_get_initiator_table_time(set_t setno,md_mn_msgclass_t class)373 mdmn_get_initiator_table_time(set_t setno, md_mn_msgclass_t class)
374 {
375 	return (mdmn_initiator_table[setno][class].wti_time);
376 }
377 
/* These are defined outside this file and are read by commd_debug() */
extern uint_t	md_commd_global_verb;	/* global bitmask for debug classes */
extern FILE	*commdout;		/* debug output file for the commd */
/* subtracted from gethrtime() when a timestamp is printed */
extern hrtime_t __savetime;
381 
382 
383 /*
384  * Print debug messages to the terminal or to syslog
385  * commd_debug(MD_MMV_SYSLOG,....) is always printed (and always via syslog),
386  * even if md_commd_global_verb is zero.
387  *
388  * Otherwise the correct bit must be set in the bitmask md_commd_global_verb
389  */
390 void
commd_debug(uint_t debug_class,const char * message,...)391 commd_debug(uint_t debug_class, const char *message, ...)
392 {
393 	va_list ap;
394 
395 	/* Is this a message for syslog? */
396 	if (debug_class == MD_MMV_SYSLOG) {
397 
398 		va_start(ap, message);
399 		(void) vsyslog(LOG_WARNING, message, ap);
400 		va_end(ap);
401 	} else {
402 		/* Is this debug_class set in the global verbosity state?  */
403 		if ((md_commd_global_verb & debug_class) == 0) {
404 			return;
405 		}
406 		/* Is our output file already functioning? */
407 		if (commdout == NULL) {
408 			return;
409 		}
410 		/* Are timestamps activated ? */
411 		if (md_commd_global_verb & MD_MMV_TIMESTAMP) {
412 			/* print time since last TRESET in usecs */
413 			(void) fprintf(commdout, "[%s]",
414 			    meta_print_hrtime(gethrtime() - __savetime));
415 		}
416 		/* Now print the real message */
417 		va_start(ap, message);
418 		(void) vfprintf(commdout, message, ap);
419 		va_end(ap);
420 	}
421 }
422 
423 
424 void
dump_hex(uint_t debug_class,unsigned int * x,int cnt)425 dump_hex(uint_t debug_class, unsigned int *x, int cnt)
426 {
427 	cnt /= sizeof (unsigned int);
428 	while (cnt--) {
429 		commd_debug(debug_class, "0x%8x ", *x++);
430 		if (cnt % 4)
431 			continue;
432 		commd_debug(debug_class, "\n");
433 	}
434 	commd_debug(debug_class, "\n");
435 }
436 
437 /* debug output: dump a message */
438 void
dump_msg(uint_t dbc,char * prefix,md_mn_msg_t * msg)439 dump_msg(uint_t dbc, char *prefix, md_mn_msg_t *msg)
440 {
441 	commd_debug(dbc, "%s &msg	= 0x%x\n", prefix, (int)msg);
442 	commd_debug(dbc, "%s ID	= (%d, 0x%llx-%d)\n", prefix,
443 	    MSGID_ELEMS(msg->msg_msgid));
444 	commd_debug(dbc, "%s sender	= %d\n", prefix, msg->msg_sender);
445 	commd_debug(dbc, "%s flags	= 0x%x\n", prefix, msg->msg_flags);
446 	commd_debug(dbc, "%s setno	= %d\n", prefix, msg->msg_setno);
447 	commd_debug(dbc, "%s recipient  = %d\n", prefix, msg->msg_recipient);
448 	commd_debug(dbc, "%s type	= %d\n", prefix, msg->msg_type);
449 	commd_debug(dbc, "%s size	= %d\n", prefix, msg->msg_event_size);
450 	if (msg->msg_event_size) {
451 		commd_debug(dbc, "%s data	=\n", prefix);
452 		dump_hex(dbc, (unsigned int *)(void *)msg->msg_event_data,
453 		    msg->msg_event_size);
454 	}
455 }
456 
457 /* debug output: dump a result structure */
458 void
dump_result(uint_t dbc,char * prefix,md_mn_result_t * res)459 dump_result(uint_t dbc, char *prefix, md_mn_result_t *res)
460 {
461 	commd_debug(dbc, "%s &res	= 0x%x\n", prefix, (int)res);
462 	commd_debug(dbc, "%s ID	= (%d, 0x%llx-%d)\n", prefix,
463 	    MSGID_ELEMS(res->mmr_msgid));
464 	commd_debug(dbc, "%s setno	= %d\n", prefix, res->mmr_setno);
465 	commd_debug(dbc, "%s type	= %d\n", prefix, res->mmr_msgtype);
466 	commd_debug(dbc, "%s flags	= 0x%x\n", prefix, res->mmr_flags);
467 	commd_debug(dbc, "%s comm_state= %d\n", prefix, res->mmr_comm_state);
468 	commd_debug(dbc, "%s exitval	= %d\n", prefix, res->mmr_exitval);
469 	commd_debug(dbc, "%s out_size	= %d\n", prefix, res->mmr_out_size);
470 	if (res->mmr_out_size)
471 		commd_debug(dbc, "%s out	= %s\n", prefix, res->mmr_out);
472 	commd_debug(dbc, "%s err_size	= %d\n", prefix, res->mmr_err_size);
473 	if (res->mmr_err_size)
474 		commd_debug(dbc, "%s err	= %s\n", prefix, res->mmr_err);
475 }
476 
477 
478 /*
479  * Here we find out, where to store or find the results for a given msg.
480  *
481  * Per set we have a pointer to a three dimensional array:
482  * mct[set] -> mct_mce[NNODES][MD_MN_NCLASSES][MAX_SUBMESSAGES]
483  * So, for every possible node and for every possible class we can store
484  * MAX_SUBMESSAGES results.
485  * the way to find the correct index is
486  *	submessage +
487  *	class * MAX_SUBMESSAGES +
488  *	nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES.
489  *
490  * To find the correct address the index has to be multiplied
491  * by the size of one entry.
492  */
493 static md_mn_mce_t *
mdmn_get_mce_by_msg(md_mn_msg_t * msg)494 mdmn_get_mce_by_msg(md_mn_msg_t *msg)
495 {
496 	set_t	setno = msg->msg_setno;
497 	int	nodeid = msg->msg_msgid.mid_nid;
498 	int	submsg = msg->msg_msgid.mid_smid;
499 	int	mct_index;
500 	off_t	mct_offset;
501 	md_mn_msgclass_t class;
502 
503 	if (mct[setno] != NULL) {
504 		if (mdmn_init_set(setno, MDMN_SET_MCT) != 0) {
505 			return ((md_mn_mce_t *)MDMN_MCT_ERROR);
506 		}
507 	}
508 
509 	if (submsg == 0) {
510 		class = mdmn_get_message_class(msg->msg_type);
511 	} else {
512 		class = msg->msg_msgid.mid_oclass;
513 	}
514 
515 	mct_index = submsg + class * MAX_SUBMESSAGES +
516 	    nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES;
517 
518 	mct_offset = mct_index * sizeof (md_mn_mce_t);
519 
520 	/* LINTED Pointer alignment */
521 	return ((md_mn_mce_t *)((caddr_t)(mct[setno]) + mct_offset));
522 
523 	/*
524 	 * the lint clean version would be:
525 	 * return (&(mct[setno]->mct_mce[0][0][0]) + mct_index);
526 	 * :-)
527 	 */
528 }
529 
530 /*
531  * mdmn_mark_completion(msg, result, flag)
532  * Stores the result of this message into the mmaped memory MCT[setno]
533  * In case the same message comes along a second time we will know that
534  * this message has already been processed and we can deliver the
535  * results immediately.
536  *
537  * Before a message handler is called, the message in the MCT is flagged
538  * as currently being processed (flag == MDMN_MCT_IN_PROGRESS).
539  * This we need so we don't start a second handler for the same message.
540  *
541  * After a message handler is completed, this routine is called with
542  * flag == MDMN_MCT_DONE and the appropriate result that we store in the MCT.
543  * As MCT[setno] is memory mapped to disks, this information is persistent
544  * even across a crash of the commd.
545  * It doesn't have to be persistent across a reboot, though.
546  *
547  * Returns MDMN_MCT_DONE in case of success
548  * Returns MDMN_MCT_ERROR in case of error creating the mct
549  */
550 int
mdmn_mark_completion(md_mn_msg_t * msg,md_mn_result_t * result,uint_t flag)551 mdmn_mark_completion(md_mn_msg_t *msg, md_mn_result_t *result, uint_t flag)
552 {
553 	md_mn_mce_t	*mce;
554 	uint_t		offset_in_page;
555 
556 	mce = mdmn_get_mce_by_msg(msg);
557 	if (mce == (md_mn_mce_t *)-1) {
558 		return (MDMN_MCT_ERROR);
559 	}
560 	offset_in_page = (uint_t)(caddr_t)mce % sysconf(_SC_PAGESIZE);
561 
562 	(void) memset(mce, 0, sizeof (md_mn_mce_t));
563 
564 	MSGID_COPY(&msg->msg_msgid, &mce->mce_result.mmr_msgid);
565 	if (flag == MDMN_MCT_IN_PROGRESS) {
566 		mce->mce_flags = MDMN_MCT_IN_PROGRESS;
567 		goto mmc_out;
568 	}
569 
570 	/*
571 	 * In case the message flags indicate that the result should not be
572 	 * stored in the MCT, we return a MDMN_MCT_NOT_DONE,
573 	 * so the message will be processed at any rate,
574 	 * even if we process this message twice.
575 	 * this makes sense if the result of the message is a dynamic status
576 	 * and might have changed meanwhile.
577 	 */
578 	if (msg->msg_flags & MD_MSGF_NO_MCT) {
579 		return (MDMN_MCT_DONE);
580 	}
581 
582 	/* This msg is no longer in progress */
583 	mce->mce_flags = MDMN_MCT_DONE;
584 
585 	mce->mce_result.mmr_msgtype	    = result->mmr_msgtype;
586 	mce->mce_result.mmr_setno	    = result->mmr_setno;
587 	mce->mce_result.mmr_flags	    = result->mmr_flags;
588 	mce->mce_result.mmr_sender	    = result->mmr_sender;
589 	mce->mce_result.mmr_failing_node    = result->mmr_failing_node;
590 	mce->mce_result.mmr_comm_state	    = result->mmr_comm_state;
591 	mce->mce_result.mmr_exitval	    = result->mmr_exitval;
592 
593 	/* if mmr_exitval is zero, we store stdout, otherwise stderr */
594 	if (result->mmr_exitval == 0) {
595 		if (result->mmr_out_size > 0) {
596 			(void) memcpy(mce->mce_data, result->mmr_out,
597 			    result->mmr_out_size);
598 			mce->mce_result.mmr_out_size = result->mmr_out_size;
599 		}
600 	} else {
601 		if (result->mmr_err_size > 0) {
602 			mce->mce_result.mmr_err_size = result->mmr_err_size;
603 			(void) memcpy(mce->mce_data, result->mmr_err,
604 			    result->mmr_err_size);
605 		}
606 	}
607 
608 	dump_result(MD_MMV_PROC_S, "mdmn_mark_completion1", result);
609 
610 mmc_out:
611 	/* now flush this entry to disk */
612 	(void) msync((caddr_t)mce - offset_in_page,
613 	    sizeof (md_mn_mce_t) + offset_in_page, MS_SYNC);
614 	return (MDMN_MCT_DONE);
615 }
616 
617 /*
618  * mdmn_check_completion(msg, resultp)
619  * checks if msg has already been processed on this node, and if so copies
620  * the stored result to resultp.
621  *
622  * returns MDMN_MCT_DONE and the result filled out acurately in case the
623  *		msg has already been processed before
624  * returns MDMN_MCT_NOT_DONE if the message has not been processed before
625  * returns MDMN_MCT_IN_PROGRESS if the message is currently being processed
626  *	This can only occur on a slave node.
627  * return MDMN_MCT_ERROR in case of error creating the mct
628  */
629 int
mdmn_check_completion(md_mn_msg_t * msg,md_mn_result_t * result)630 mdmn_check_completion(md_mn_msg_t *msg, md_mn_result_t *result)
631 {
632 	md_mn_mce_t	*mce;
633 	size_t		outsize;
634 	size_t		errsize;
635 
636 	mce = mdmn_get_mce_by_msg(msg);
637 	if (mce == (md_mn_mce_t *)MDMN_MCT_ERROR) {
638 		return (MDMN_MCT_ERROR); /* what to do in that case ? */
639 	}
640 	if (MSGID_CMP(&(msg->msg_msgid), &(mce->mce_result.mmr_msgid))) {
641 		/* is the message completed, or in progress? */
642 		if (mce->mce_flags & MDMN_MCT_IN_PROGRESS) {
643 			return (MDMN_MCT_IN_PROGRESS);
644 		}
645 		/*
646 		 * See comment on MD_MSGF_NO_MCT above, if this flag is set
647 		 * for a message no result was stored and so the message has
648 		 * to be processed no matter if this is the 2nd time then.
649 		 */
650 		if (msg->msg_flags & MD_MSGF_NO_MCT) {
651 			return (MDMN_MCT_NOT_DONE);
652 		}
653 
654 		/* Paranoia check: mce_flags must be MDMN_MCT_DONE here */
655 		if ((mce->mce_flags & MDMN_MCT_DONE) == 0) {
656 			commd_debug(MD_MMV_ALL,
657 			    "mdmn_check_completion: msg not done and not in "
658 			    "progress! ID = (%d, 0x%llx-%d)\n",
659 			    MSGID_ELEMS(msg->msg_msgid));
660 			return (MDMN_MCT_NOT_DONE);
661 		}
662 		/*
663 		 * Already processed.
664 		 * Copy saved results data;
665 		 * return only a pointer to any output.
666 		 */
667 		MSGID_COPY(&(mce->mce_result.mmr_msgid), &result->mmr_msgid);
668 		result->mmr_msgtype	    = mce->mce_result.mmr_msgtype;
669 		result->mmr_setno	    = mce->mce_result.mmr_setno;
670 		result->mmr_flags	    = mce->mce_result.mmr_flags;
671 		result->mmr_sender	    = mce->mce_result.mmr_sender;
672 		result->mmr_failing_node    = mce->mce_result.mmr_failing_node;
673 		result->mmr_comm_state	    = mce->mce_result.mmr_comm_state;
674 		result->mmr_exitval	    = mce->mce_result.mmr_exitval;
675 		result->mmr_err		    = NULL;
676 		result->mmr_out		    = NULL;
677 		outsize = result->mmr_out_size = mce->mce_result.mmr_out_size;
678 		errsize = result->mmr_err_size = mce->mce_result.mmr_err_size;
679 		/*
680 		 * if the exit val is zero only stdout was stored (if any)
681 		 * otherwise only stderr was stored (if any)
682 		 */
683 		if (result->mmr_exitval == 0) {
684 			if (outsize != 0) {
685 				result->mmr_out = Zalloc(outsize);
686 				(void) memcpy(result->mmr_out, mce->mce_data,
687 				    outsize);
688 			}
689 		} else {
690 			if (errsize != 0) {
691 				result->mmr_err = Zalloc(errsize);
692 				(void) memcpy(result->mmr_err, mce->mce_data,
693 				    errsize);
694 			}
695 		}
696 		commd_debug(MD_MMV_MISC,
697 		    "mdmn_check_completion: msg already processed \n");
698 		dump_result(MD_MMV_MISC, "mdmn_check_completion", result);
699 		return (MDMN_MCT_DONE);
700 	}
701 	commd_debug(MD_MMV_MISC,
702 	    "mdmn_check_completion: msg not yet processed\n");
703 	return (MDMN_MCT_NOT_DONE);
704 }
705 
706 
707 
708 /*
709  * check_license(rqstp, chknid)
710  *
711  * Is this RPC request sent from a licensed host?
712  *
713  * If chknid is non-zero, the caller of check_license() knows the ID of
714  * the sender. Then we check just the one entry of licensed_nodes[]
715  *
716  * If chknid is zero, the sender is not known. In that case the sender must be
717  * the local node.
718  *
719  * If the host is licensed, return TRUE, else return FALSE
720  */
721 bool_t
check_license(struct svc_req * rqstp,md_mn_nodeid_t chknid)722 check_license(struct svc_req *rqstp, md_mn_nodeid_t chknid)
723 {
724 	char		buf[INET6_ADDRSTRLEN];
725 	void		*caller = NULL;
726 	in_addr_t	caller_ipv4;
727 	in6_addr_t	caller_ipv6;
728 	struct sockaddr	*ca;
729 
730 
731 	ca = (struct sockaddr *)(void *)svc_getrpccaller(rqstp->rq_xprt)->buf;
732 
733 	if (ca->sa_family == AF_INET) {
734 		caller_ipv4 =
735 		    ((struct sockaddr_in *)(void *)ca)->sin_addr.s_addr;
736 		caller = (void *)&caller_ipv4;
737 
738 		if (chknid == 0) {
739 			/* check against local node */
740 			if (caller_ipv4 == htonl(INADDR_LOOPBACK)) {
741 				return (TRUE);
742 
743 			}
744 		} else {
745 			/* check against one specific node */
746 			if ((caller_ipv4 == licensed_nodes[chknid].lip_ipv4) &&
747 			    (licensed_nodes[chknid].lip_family == AF_INET)) {
748 				return (TRUE);
749 			} else {
750 				commd_debug(MD_MMV_MISC,
751 				    "Bad attempt from %x ln[%d]=%x\n",
752 				    caller_ipv4, chknid,
753 				    licensed_nodes[chknid].lip_ipv4);
754 			}
755 		}
756 	} else if (ca->sa_family == AF_INET6) {
757 		caller_ipv6 = ((struct sockaddr_in6 *)(void *)ca)->sin6_addr;
758 		caller = (void *)&caller_ipv6;
759 
760 		if (chknid == 0) {
761 			/* check against local node */
762 			if (IN6_IS_ADDR_LOOPBACK(&caller_ipv6)) {
763 				return (TRUE);
764 
765 			}
766 		} else {
767 			/* check against one specific node */
768 			if (IN6_ARE_ADDR_EQUAL(&caller_ipv6,
769 			    &(licensed_nodes[chknid].lip_ipv6)) &&
770 			    (licensed_nodes[chknid].lip_family == AF_INET6)) {
771 				return (TRUE);
772 			}
773 		}
774 	}
775 	/* if  we are here, we were contacted by an unlicensed node */
776 	commd_debug(MD_MMV_SYSLOG,
777 	    "Bad attempt to contact rpc.mdcommd from %s\n",
778 	    caller ?
779 	    inet_ntop(ca->sa_family, caller, buf, INET6_ADDRSTRLEN) :
780 	    "unknown");
781 
782 	return (FALSE);
783 }
784 
785 /*
786  * Add a node to the list of licensed nodes.
787  *
788  * Only IPv4 is currently supported.
789  * for IPv6, we need to change md_mnnode_desc.
790  */
791 void
add_license(md_mnnode_desc * node)792 add_license(md_mnnode_desc *node)
793 {
794 	md_mn_nodeid_t nid = node->nd_nodeid;
795 	char		buf[INET6_ADDRSTRLEN];
796 
797 	/*
798 	 * If this node is not yet licensed, do it now.
799 	 * For now only IPv4 addresses are supported.
800 	 */
801 	commd_debug(MD_MMV_MISC, "add_lic(%s): ln[%d]=%s, lnc[%d]=%d\n",
802 	    node->nd_priv_ic, nid,
803 	    inet_ntop(AF_INET, (void *)&licensed_nodes[nid].lip_ipv4,
804 	    buf, INET6_ADDRSTRLEN), nid, licensed_nodes[nid].lip_cnt);
805 
806 	if (licensed_nodes[nid].lip_ipv4 == (in_addr_t)0) {
807 		licensed_nodes[nid].lip_family = AF_INET; /* IPv4 */
808 		licensed_nodes[nid].lip_ipv4 = inet_addr(node->nd_priv_ic);
809 		/* keep track of the last entry for faster search */
810 		if (nid > maxlicnodes)
811 			maxlicnodes = nid;
812 
813 	}
814 	/* in any case bump up the reference count */
815 	licensed_nodes[nid].lip_cnt++;
816 }
817 
818 /*
819  * lower the reference count for one node.
820  * If that drops to zero, remove the node from the list of licensed nodes
821  *
822  * Only IPv4 is currently supported.
823  * for IPv6, we need to change md_mnnode_desc.
824  */
825 void
rem_license(md_mnnode_desc * node)826 rem_license(md_mnnode_desc *node)
827 {
828 	md_mn_nodeid_t nid = node->nd_nodeid;
829 	char		buf[INET6_ADDRSTRLEN];
830 
831 	commd_debug(MD_MMV_MISC, "rem_lic(%s): ln[%d]=%s, lnc[%d]=%d\n",
832 	    node->nd_priv_ic, nid,
833 	    inet_ntop(AF_INET, (void *)&licensed_nodes[nid].lip_ipv4, buf,
834 	    INET6_ADDRSTRLEN), nid, licensed_nodes[nid].lip_cnt);
835 
836 	assert(licensed_nodes[nid].lip_cnt > 0);
837 
838 	/*
839 	 * If this was the last reference to that node, it's license expires
840 	 * For now only IPv4 addresses are supported.
841 	 */
842 	if (--licensed_nodes[nid].lip_cnt == 0) {
843 		licensed_nodes[nid].lip_ipv4 = (in_addr_t)0;
844 	}
845 }
846