xref: /titanic_51/usr/src/uts/common/os/msg.c (revision 7f0b8309074a5d8e9f9d8ffe7aad7bb0b1ee6b1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 /*
31  * Inter-Process Communication Message Facility.
32  *
33  * See os/ipc.c for a description of common IPC functionality.
34  *
35  * Resource controls
36  * -----------------
37  *
38  * Control:      zone.max-msg-ids (rc_zone_msgmni)
39  * Description:  Maximum number of message queue ids allowed a zone.
40  *
41  *   When msgget() is used to allocate a message queue, one id is
42  *   allocated.  If the id allocation doesn't succeed, msgget() fails
43  *   and errno is set to ENOSPC.  Upon successful msgctl(, IPC_RMID)
44  *   the id is deallocated.
45  *
46  * Control:      project.max-msg-ids (rc_project_msgmni)
47  * Description:  Maximum number of message queue ids allowed a project.
48  *
49  *   When msgget() is used to allocate a message queue, one id is
50  *   allocated.  If the id allocation doesn't succeed, msgget() fails
51  *   and errno is set to ENOSPC.  Upon successful msgctl(, IPC_RMID)
52  *   the id is deallocated.
53  *
54  * Control:      process.max-msg-qbytes (rc_process_msgmnb)
55  * Description:  Maximum number of bytes of messages on a message queue.
56  *
57  *   When msgget() successfully allocates a message queue, the minimum
58  *   enforced value of this limit is used to initialize msg_qbytes.
59  *
60  * Control:      process.max-msg-messages (rc_process_msgtql)
61  * Description:  Maximum number of messages on a message queue.
62  *
63  *   When msgget() successfully allocates a message queue, the minimum
64  *   enforced value of this limit is used to initialize a per-queue
65  *   limit on the number of messages.
66  */
67 
68 #include <sys/types.h>
69 #include <sys/t_lock.h>
70 #include <sys/param.h>
71 #include <sys/cred.h>
72 #include <sys/user.h>
73 #include <sys/proc.h>
74 #include <sys/time.h>
75 #include <sys/ipc.h>
76 #include <sys/ipc_impl.h>
77 #include <sys/msg.h>
78 #include <sys/msg_impl.h>
79 #include <sys/list.h>
80 #include <sys/systm.h>
81 #include <sys/sysmacros.h>
82 #include <sys/cpuvar.h>
83 #include <sys/kmem.h>
84 #include <sys/ddi.h>
85 #include <sys/errno.h>
86 #include <sys/cmn_err.h>
87 #include <sys/debug.h>
88 #include <sys/project.h>
89 #include <sys/modctl.h>
90 #include <sys/syscall.h>
91 #include <sys/policy.h>
92 #include <sys/zone.h>
93 
94 #include <c2/audit.h>
95 
96 /*
97  * The following tunables are obsolete.  Though for compatibility we
98  * still read and interpret msginfo_msgmnb, msginfo_msgmni, and
99  * msginfo_msgtql (see os/project.c and os/rctl_proc.c), the preferred
100  * mechanism for administrating the IPC Message facility is through the
101  * resource controls described at the top of this file.
102  */
103 size_t	msginfo_msgmax = 2048;	/* (obsolete) */
104 size_t	msginfo_msgmnb = 4096;	/* (obsolete) */
105 int	msginfo_msgmni = 50;	/* (obsolete) */
106 int	msginfo_msgtql = 40;	/* (obsolete) */
107 int	msginfo_msgssz = 8;	/* (obsolete) */
108 int	msginfo_msgmap = 0;	/* (obsolete) */
109 ushort_t msginfo_msgseg = 1024;	/* (obsolete) */
110 
111 extern rctl_hndl_t rc_zone_msgmni;
112 extern rctl_hndl_t rc_project_msgmni;
113 extern rctl_hndl_t rc_process_msgmnb;
114 extern rctl_hndl_t rc_process_msgtql;
115 static ipc_service_t *msq_svc;
116 static zone_key_t msg_zone_key;
117 
118 static void msg_dtor(kipc_perm_t *);
119 static void msg_rmid(kipc_perm_t *);
120 static void msg_remove_zone(zoneid_t, void *);
121 
122 /*
123  * Module linkage information for the kernel.
124  */
125 static ssize_t msgsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2,
126 	uintptr_t a4, uintptr_t a5);
127 
128 static struct sysent ipcmsg_sysent = {
129 	6,
130 #ifdef	_LP64
131 	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
132 #else
133 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
134 #endif
135 	(int (*)())msgsys
136 };
137 
138 #ifdef	_SYSCALL32_IMPL
139 static ssize32_t msgsys32(int opcode, uint32_t a0, uint32_t a1, uint32_t a2,
140 	uint32_t a4, uint32_t a5);
141 
142 static struct sysent ipcmsg_sysent32 = {
143 	6,
144 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
145 	(int (*)())msgsys32
146 };
147 #endif	/* _SYSCALL32_IMPL */
148 
149 static struct modlsys modlsys = {
150 	&mod_syscallops, "System V message facility", &ipcmsg_sysent
151 };
152 
153 #ifdef _SYSCALL32_IMPL
154 static struct modlsys modlsys32 = {
155 	&mod_syscallops32, "32-bit System V message facility", &ipcmsg_sysent32
156 };
157 #endif
158 
159 /*
160  *      Big Theory statement for message queue correctness
161  *
 * The msgrcv and msgsnd functions no longer use cv_broadcast to wake up
 * receivers who are waiting for an event.  Using the cv_broadcast method
 * resulted in negative scaling when the number of waiting receivers is large
 * (the thundering herd problem).  Instead, the receivers waiting to receive a
 * message are now linked in a queue-like fashion and awakened one at a time
 * in a controlled manner.
168  *
169  * Receivers can block on two different classes of waiting list:
170  *    1) "sendwait" list, which is the more complex list of the two.  The
171  *	  receiver will be awakened by a sender posting a new message.  There
172  *	  are two types of "sendwait" list used:
 *		a) msg_wait_snd: handles all receivers who are looking for
 *		   a message type >= 0, but were unable to locate a match.
175  *
176  *		   slot 0: reserved for receivers that have designated they
177  *			   will take any message type.
178  *		   rest:   consist of receivers requesting a specific type
179  *			   but the type was not present.  The entries are
180  *			   hashed into a bucket in an attempt to keep
181  *			   any list search relatively short.
182  * 		b) msg_wait_snd_ngt: handles all receivers that have designated
183  *		   a negative message type. Unlike msg_wait_snd, the hash bucket
184  *		   serves a range of negative message types (-1 to -5, -6 to -10
185  *		   and so forth), where the last bucket is reserved for all the
186  *		   negative message types that hash outside of MSG_MAX_QNUM - 1.
187  *		   This is done this way to simplify the operation of locating a
188  *		   negative message type.
189  *
190  *    2) "copyout" list, where the receiver is awakened by another
191  *	 receiver after a message is copied out.  This is a linked list
192  *	 of waiters that are awakened one at a time.  Although the solution is
193  *	 not optimal, the complexity that would be added in for waking
194  *	 up the right entry far exceeds any potential pay back (too many
195  *	 correctness and corner case issues).
196  *
197  * The lists are doubly linked.  In the case of the "sendwait"
198  * list, this allows the thread to remove itself from the list without having
199  * to traverse the list.  In the case of the "copyout" list it simply allows
200  * us to use common functions with the "sendwait" list.
201  *
202  * To make sure receivers are not hung out to dry, we must guarantee:
203  *    1. If any queued message matches any receiver, then at least one
204  *       matching receiver must be processing the request.
 *    2. Blocking on the copyout queue is only temporary while messages
 *	 are being copied out.  The process is guaranteed to wake up
 *	 when it gets to the front of the queue (copyout is a FIFO).
208  *
209  * Rules for blocking and waking up:
210  *   1. A receiver entering msgrcv must examine all messages for a match
211  *      before blocking on a sendwait queue.
 *   2. If the receiver blocks because the message it chose is already
 *	being copied out, then when it wakes up it needs to start
 *	checking the messages from the beginning.
 *   3) Whenever a process returns from msgrcv for any reason, if it
 *	had attempted to copy a message or blocked waiting for a copy
 *	to complete it needs to wake up the next receiver blocked on
 *	a copy out.
219  *   4) When a message is sent, the sender selects a process waiting
220  *	for that type of message.  This selection process rotates between
221  *	receivers types of 0, negative and positive to prevent starvation of
222  *	any one particular receiver type.
223  *   5) The following are the scenarios for processes that are awakened
224  *	by a msgsnd:
225  *		a) The process finds the message and is able to copy
226  *		   it out.  Once complete, the process returns.
227  *		b) The message that was sent that triggered the wakeup is no
228  *		   longer available (another process found the message first).
229  *		   We issue a wakeup on copy queue and then go back to
230  *		   sleep waiting for another matching message to be sent.
231  *		c) The message that was supposed to be processed was
232  *		   already serviced by another process.  However a different
233  *		   message is present which we can service.  The message
234  *		   is copied and the process returns.
235  *		d) The message is found, but some sort of error occurs that
236  *		   prevents the message from being copied.  The receiver
237  *		   wakes up the next sender that can service this message
238  *		   type and returns an error to the caller.
239  *		e) The message is found, but it is marked as being copied
240  *		   out.  The receiver then goes to sleep on the copyout
241  *		   queue where it will be awakened again sometime in the future.
242  *
243  *
244  *   6) Whenever a message is found that matches the message type designated,
245  * 	but is being copied out we have to block on the copyout queue.
 *	After the copying process finishes the copy out, it must wake up
 *	(either directly or indirectly) all receivers who blocked on its
 *	copyout, so they are guaranteed a chance to examine the remaining
 *	messages.
249  *	This is implemented via a chain of wakeups: Y wakes X, who wakes Z,
250  *	and so on.  The chain cannot be broken.  This leads to the following
251  *	cases:
 *		a) When a receiver has finished copying the message (or
 *		   encountered an error), the first entry on the copyout
 *		   queue is woken up.
255  *		b) When the receiver is woken up, it attempts to locate
256  *		   a message type match.
257  *		c) If a message type is found and
258  *			-- MSG_RCVCOPY flag is not set, the message is
259  *			   marked for copying out.  Regardless of the copyout
260  *			   success the next entry on the copyout queue is
261  *			   awakened and the operation is completed.
262  *			-- MSG_RCVCOPY is set, we simply go back to sleep again
263  *			   on the copyout queue.
264  *		d) If the message type is not found then we wakeup the next
265  *		   process on the copyout queue.
 *   7) If a msgsnd is unable to complete for any of the following reasons
267  *	  a) the msgq has no space for the message
268  *	  b) the maximum number of messages allowed has been reached
269  *      then one of two things happen:
 *	  1) If the passed in msg_flag has IPC_NOWAIT set, then
 *	     an error is returned.
 *	  2) If the IPC_NOWAIT bit is not set in msg_flag, then
 *	     the thread is put to sleep until the request can be
 *	     serviced.
275  *   8) When waking a thread waiting to send a message, a check is done to
276  *      verify that the operation being asked for by the thread will complete.
277  *      This decision making process is done in a loop where the oldest request
278  *      is checked first. The search will continue until there is no more
279  *	room on the msgq or we have checked all the waiters.
280  */
281 
282 static uint_t msg_type_hash(long);
283 static int msgq_check_err(kmsqid_t *qp, int cvres);
284 static int msg_rcvq_sleep(list_t *, msgq_wakeup_t *, kmutex_t **,
285     kmsqid_t *);
286 static int msg_copyout(kmsqid_t *, long, kmutex_t **, size_t *, size_t,
287     struct msg *, struct ipcmsgbuf *, int);
288 static void msg_rcvq_wakeup_all(list_t *);
289 static void msg_wakeup_senders(kmsqid_t *);
290 static void msg_wakeup_rdr(kmsqid_t *, msg_select_t **, long);
291 static msgq_wakeup_t *msg_fnd_any_snd(kmsqid_t *, int, long);
292 static msgq_wakeup_t *msg_fnd_any_rdr(kmsqid_t *, int, long);
293 static msgq_wakeup_t *msg_fnd_neg_snd(kmsqid_t *, int, long);
294 static msgq_wakeup_t *msg_fnd_spc_snd(kmsqid_t *, int, long);
295 static struct msg *msgrcv_lookup(kmsqid_t *, long);
296 
297 msg_select_t msg_fnd_sndr[] = {
298 	{ msg_fnd_any_snd, &msg_fnd_sndr[1] },
299 	{ msg_fnd_spc_snd, &msg_fnd_sndr[2] },
300 	{ msg_fnd_neg_snd, &msg_fnd_sndr[0] }
301 };
302 
303 msg_select_t msg_fnd_rdr[1] = {
304 	{ msg_fnd_any_rdr, &msg_fnd_rdr[0] },
305 };
306 
307 static struct modlinkage modlinkage = {
308 	MODREV_1,
309 	&modlsys,
310 #ifdef _SYSCALL32_IMPL
311 	&modlsys32,
312 #endif
313 	NULL
314 };
315 
/*
 * Starting value for the per-queue "smallest so far" fields that msgget()
 * initializes (msg_lowest_type and msg_snd_smallest): the largest size_t.
 * The expansion is fully parenthesized so that an adjacent operator such
 * as sizeof cannot bind to the cast alone.
 */
#define	MSG_SMALL_INIT	((size_t)-1)
317 int
318 _init(void)
319 {
320 	int result;
321 
322 	msq_svc = ipcs_create("msqids", rc_project_msgmni, rc_zone_msgmni,
323 	    sizeof (kmsqid_t), msg_dtor, msg_rmid, AT_IPC_MSG,
324 	    offsetof(ipc_rqty_t, ipcq_msgmni));
325 	zone_key_create(&msg_zone_key, NULL, msg_remove_zone, NULL);
326 
327 	if ((result = mod_install(&modlinkage)) == 0)
328 		return (0);
329 
330 	(void) zone_key_delete(msg_zone_key);
331 	ipcs_destroy(msq_svc);
332 
333 	return (result);
334 }
335 
/*
 * Module unload entry point.  Unloading is always refused with EBUSY.
 */
int
_fini(void)
{
	return (EBUSY);
}
341 
342 int
343 _info(struct modinfo *modinfop)
344 {
345 	return (mod_info(&modlinkage, modinfop));
346 }
347 
348 static void
349 msg_dtor(kipc_perm_t *perm)
350 {
351 	kmsqid_t *qp = (kmsqid_t *)perm;
352 	int		ii;
353 
354 	for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
355 		ASSERT(list_is_empty(&qp->msg_wait_snd[ii]));
356 		ASSERT(list_is_empty(&qp->msg_wait_snd_ngt[ii]));
357 		list_destroy(&qp->msg_wait_snd[ii]);
358 		list_destroy(&qp->msg_wait_snd_ngt[ii]);
359 	}
360 	ASSERT(list_is_empty(&qp->msg_cpy_block));
361 	ASSERT(list_is_empty(&qp->msg_wait_rcv));
362 	list_destroy(&qp->msg_cpy_block);
363 	ASSERT(qp->msg_snd_cnt == 0);
364 	ASSERT(qp->msg_cbytes == 0);
365 	list_destroy(&qp->msg_list);
366 	list_destroy(&qp->msg_wait_rcv);
367 }
368 
369 
/* msg_hold - take a reference on a message; paired with msg_rele(). */
#define	msg_hold(mp)	((mp)->msg_copycnt++)
371 
372 /*
373  * msg_rele - decrement the reference count on the message.  When count
374  * reaches zero, free message header and contents.
375  */
376 static void
377 msg_rele(struct msg *mp)
378 {
379 	ASSERT(mp->msg_copycnt > 0);
380 	if (mp->msg_copycnt-- == 1) {
381 		if (mp->msg_addr)
382 			kmem_free(mp->msg_addr, mp->msg_size);
383 		kmem_free(mp, sizeof (struct msg));
384 	}
385 }
386 
387 /*
388  * msgunlink - Unlink msg from queue, decrement byte count and wake up anyone
389  * waiting for free bytes on queue.
390  *
391  * Called with queue locked.
392  */
393 static void
394 msgunlink(kmsqid_t *qp, struct msg *mp)
395 {
396 	list_remove(&qp->msg_list, mp);
397 	qp->msg_qnum--;
398 	qp->msg_cbytes -= mp->msg_size;
399 	msg_rele(mp);
400 
401 	/* Wake up waiting writers */
402 	msg_wakeup_senders(qp);
403 }
404 
405 static void
406 msg_rmid(kipc_perm_t *perm)
407 {
408 	kmsqid_t *qp = (kmsqid_t *)perm;
409 	struct msg *mp;
410 	int		ii;
411 
412 
413 	while ((mp = list_head(&qp->msg_list)) != NULL)
414 		msgunlink(qp, mp);
415 	ASSERT(qp->msg_cbytes == 0);
416 
417 	/*
418 	 * Wake up everyone who is in a wait state of some sort
419 	 * for this message queue.
420 	 */
421 	for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
422 		msg_rcvq_wakeup_all(&qp->msg_wait_snd[ii]);
423 		msg_rcvq_wakeup_all(&qp->msg_wait_snd_ngt[ii]);
424 	}
425 	msg_rcvq_wakeup_all(&qp->msg_cpy_block);
426 	msg_rcvq_wakeup_all(&qp->msg_wait_rcv);
427 }
428 
429 /*
430  * msgctl system call.
431  *
432  * gets q lock (via ipc_lookup), releases before return.
433  * may call users of msg_lock
434  */
435 static int
436 msgctl(int msgid, int cmd, void *arg)
437 {
438 	STRUCT_DECL(msqid_ds, ds);		/* SVR4 queue work area */
439 	kmsqid_t		*qp;		/* ptr to associated q */
440 	int			error;
441 	struct	cred		*cr;
442 	model_t	mdl = get_udatamodel();
443 	struct msqid_ds64	ds64;
444 	kmutex_t		*lock;
445 	proc_t			*pp = curproc;
446 
447 	STRUCT_INIT(ds, mdl);
448 	cr = CRED();
449 
450 	/*
451 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
452 	 */
453 	switch (cmd) {
454 	case IPC_SET:
455 		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
456 			return (set_errno(EFAULT));
457 		break;
458 
459 	case IPC_SET64:
460 		if (copyin(arg, &ds64, sizeof (struct msqid_ds64)))
461 			return (set_errno(EFAULT));
462 		break;
463 
464 	case IPC_RMID:
465 		if (error = ipc_rmid(msq_svc, msgid, cr))
466 			return (set_errno(error));
467 		return (0);
468 	}
469 
470 	/*
471 	 * get msqid_ds for this msgid
472 	 */
473 	if ((lock = ipc_lookup(msq_svc, msgid, (kipc_perm_t **)&qp)) == NULL)
474 		return (set_errno(EINVAL));
475 
476 	switch (cmd) {
477 	case IPC_SET:
478 		if (STRUCT_FGET(ds, msg_qbytes) > qp->msg_qbytes &&
479 		    secpolicy_ipc_config(cr) != 0) {
480 			mutex_exit(lock);
481 			return (set_errno(EPERM));
482 		}
483 		if (error = ipcperm_set(msq_svc, cr, &qp->msg_perm,
484 		    &STRUCT_BUF(ds)->msg_perm, mdl)) {
485 			mutex_exit(lock);
486 			return (set_errno(error));
487 		}
488 		qp->msg_qbytes = STRUCT_FGET(ds, msg_qbytes);
489 		qp->msg_ctime = gethrestime_sec();
490 		break;
491 
492 	case IPC_STAT:
493 		if (error = ipcperm_access(&qp->msg_perm, MSG_R, cr)) {
494 			mutex_exit(lock);
495 			return (set_errno(error));
496 		}
497 
498 		if (qp->msg_rcv_cnt)
499 			qp->msg_perm.ipc_mode |= MSG_RWAIT;
500 		if (qp->msg_snd_cnt)
501 			qp->msg_perm.ipc_mode |= MSG_WWAIT;
502 		ipcperm_stat(&STRUCT_BUF(ds)->msg_perm, &qp->msg_perm, mdl);
503 		qp->msg_perm.ipc_mode &= ~(MSG_RWAIT|MSG_WWAIT);
504 		STRUCT_FSETP(ds, msg_first, NULL); 	/* kernel addr */
505 		STRUCT_FSETP(ds, msg_last, NULL);
506 		STRUCT_FSET(ds, msg_cbytes, qp->msg_cbytes);
507 		STRUCT_FSET(ds, msg_qnum, qp->msg_qnum);
508 		STRUCT_FSET(ds, msg_qbytes, qp->msg_qbytes);
509 		STRUCT_FSET(ds, msg_lspid, qp->msg_lspid);
510 		STRUCT_FSET(ds, msg_lrpid, qp->msg_lrpid);
511 		STRUCT_FSET(ds, msg_stime, qp->msg_stime);
512 		STRUCT_FSET(ds, msg_rtime, qp->msg_rtime);
513 		STRUCT_FSET(ds, msg_ctime, qp->msg_ctime);
514 		break;
515 
516 	case IPC_SET64:
517 		mutex_enter(&pp->p_lock);
518 		if ((ds64.msgx_qbytes > qp->msg_qbytes) &&
519 		    secpolicy_ipc_config(cr) != 0 &&
520 		    rctl_test(rc_process_msgmnb, pp->p_rctls, pp,
521 		    ds64.msgx_qbytes, RCA_SAFE) & RCT_DENY) {
522 			mutex_exit(&pp->p_lock);
523 			mutex_exit(lock);
524 			return (set_errno(EPERM));
525 		}
526 		mutex_exit(&pp->p_lock);
527 		if (error = ipcperm_set64(msq_svc, cr, &qp->msg_perm,
528 		    &ds64.msgx_perm)) {
529 			mutex_exit(lock);
530 			return (set_errno(error));
531 		}
532 		qp->msg_qbytes = ds64.msgx_qbytes;
533 		qp->msg_ctime = gethrestime_sec();
534 		break;
535 
536 	case IPC_STAT64:
537 		if (qp->msg_rcv_cnt)
538 			qp->msg_perm.ipc_mode |= MSG_RWAIT;
539 		if (qp->msg_snd_cnt)
540 			qp->msg_perm.ipc_mode |= MSG_WWAIT;
541 		ipcperm_stat64(&ds64.msgx_perm, &qp->msg_perm);
542 		qp->msg_perm.ipc_mode &= ~(MSG_RWAIT|MSG_WWAIT);
543 		ds64.msgx_cbytes = qp->msg_cbytes;
544 		ds64.msgx_qnum = qp->msg_qnum;
545 		ds64.msgx_qbytes = qp->msg_qbytes;
546 		ds64.msgx_lspid = qp->msg_lspid;
547 		ds64.msgx_lrpid = qp->msg_lrpid;
548 		ds64.msgx_stime = qp->msg_stime;
549 		ds64.msgx_rtime = qp->msg_rtime;
550 		ds64.msgx_ctime = qp->msg_ctime;
551 		break;
552 
553 	default:
554 		mutex_exit(lock);
555 		return (set_errno(EINVAL));
556 	}
557 
558 	mutex_exit(lock);
559 
560 	/*
561 	 * Do copyout last (after releasing mutex).
562 	 */
563 	switch (cmd) {
564 	case IPC_STAT:
565 		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
566 			return (set_errno(EFAULT));
567 		break;
568 
569 	case IPC_STAT64:
570 		if (copyout(&ds64, arg, sizeof (struct msqid_ds64)))
571 			return (set_errno(EFAULT));
572 		break;
573 	}
574 
575 	return (0);
576 }
577 
578 /*
579  * Remove all message queues associated with a given zone.  Called by
580  * zone_shutdown when the zone is halted.
581  */
582 /*ARGSUSED1*/
583 static void
584 msg_remove_zone(zoneid_t zoneid, void *arg)
585 {
586 	ipc_remove_zone(msq_svc, zoneid);
587 }
588 
589 /*
590  * msgget system call.
591  */
592 static int
593 msgget(key_t key, int msgflg)
594 {
595 	kmsqid_t	*qp;
596 	kmutex_t	*lock;
597 	int		id, error;
598 	int		ii;
599 	proc_t		*pp = curproc;
600 
601 top:
602 	if (error = ipc_get(msq_svc, key, msgflg, (kipc_perm_t **)&qp, &lock))
603 		return (set_errno(error));
604 
605 	if (IPC_FREE(&qp->msg_perm)) {
606 		mutex_exit(lock);
607 		mutex_exit(&pp->p_lock);
608 
609 		list_create(&qp->msg_list, sizeof (struct msg),
610 		    offsetof(struct msg, msg_node));
611 		qp->msg_qnum = 0;
612 		qp->msg_lspid = qp->msg_lrpid = 0;
613 		qp->msg_stime = qp->msg_rtime = 0;
614 		qp->msg_ctime = gethrestime_sec();
615 		qp->msg_ngt_cnt = 0;
616 		qp->msg_neg_copy = 0;
617 		for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
618 			list_create(&qp->msg_wait_snd[ii],
619 			    sizeof (msgq_wakeup_t),
620 			    offsetof(msgq_wakeup_t, msgw_list));
621 			list_create(&qp->msg_wait_snd_ngt[ii],
622 			    sizeof (msgq_wakeup_t),
623 			    offsetof(msgq_wakeup_t, msgw_list));
624 		}
625 		/*
626 		 * The proper initialization of msg_lowest_type is to the
627 		 * highest possible value.  By doing this we guarantee that
628 		 * when the first send happens, the lowest type will be set
629 		 * properly.
630 		 */
631 		qp->msg_lowest_type = MSG_SMALL_INIT;
632 		list_create(&qp->msg_cpy_block,
633 		    sizeof (msgq_wakeup_t),
634 		    offsetof(msgq_wakeup_t, msgw_list));
635 		list_create(&qp->msg_wait_rcv,
636 		    sizeof (msgq_wakeup_t),
637 		    offsetof(msgq_wakeup_t, msgw_list));
638 		qp->msg_fnd_sndr = &msg_fnd_sndr[0];
639 		qp->msg_fnd_rdr = &msg_fnd_rdr[0];
640 		qp->msg_rcv_cnt = 0;
641 		qp->msg_snd_cnt = 0;
642 		qp->msg_snd_smallest = MSG_SMALL_INIT;
643 
644 		if (error = ipc_commit_begin(msq_svc, key, msgflg,
645 		    (kipc_perm_t *)qp)) {
646 			if (error == EAGAIN)
647 				goto top;
648 			return (set_errno(error));
649 		}
650 		qp->msg_qbytes = rctl_enforced_value(rc_process_msgmnb,
651 		    pp->p_rctls, pp);
652 		qp->msg_qmax = rctl_enforced_value(rc_process_msgtql,
653 		    pp->p_rctls, pp);
654 		lock = ipc_commit_end(msq_svc, &qp->msg_perm);
655 	}
656 	if (audit_active)
657 		audit_ipcget(AT_IPC_MSG, (void *)qp);
658 	id = qp->msg_perm.ipc_id;
659 	mutex_exit(lock);
660 	return (id);
661 }
662 
663 static ssize_t
664 msgrcv(int msqid, struct ipcmsgbuf *msgp, size_t msgsz, long msgtyp, int msgflg)
665 {
666 	struct msg	*smp;	/* ptr to best msg on q */
667 	kmsqid_t	*qp;	/* ptr to associated q */
668 	kmutex_t	*lock;
669 	size_t		xtsz;	/* transfer byte count */
670 	int		error = 0;
671 	int		cvres;
672 	uint_t		msg_hash;
673 	msgq_wakeup_t	msg_entry;
674 
675 	CPU_STATS_ADDQ(CPU, sys, msg, 1);	/* bump msg send/rcv count */
676 
677 	msg_hash = msg_type_hash(msgtyp);
678 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL) {
679 		return ((ssize_t)set_errno(EINVAL));
680 	}
681 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
682 
683 	if (error = ipcperm_access(&qp->msg_perm, MSG_R, CRED())) {
684 		goto msgrcv_out;
685 	}
686 
687 	/*
688 	 * Various information (including the condvar_t) required for the
689 	 * process to sleep is provided by it's stack.
690 	 */
691 	msg_entry.msgw_thrd = curthread;
692 	msg_entry.msgw_snd_wake = 0;
693 	msg_entry.msgw_type = msgtyp;
694 findmsg:
695 	smp = msgrcv_lookup(qp, msgtyp);
696 
697 	if (smp) {
698 		/*
699 		 * We found a possible message to copy out.
700 		 */
701 		if ((smp->msg_flags & MSG_RCVCOPY) == 0) {
702 			long t = msg_entry.msgw_snd_wake;
703 			long copy_type = smp->msg_type;
704 
705 			/*
706 			 * It is available, attempt to copy it.
707 			 */
708 			error = msg_copyout(qp, msgtyp, &lock, &xtsz, msgsz,
709 			    smp, msgp, msgflg);
710 
711 			/*
712 			 * It is possible to consume a different message
713 			 * type then what originally awakened for (negative
714 			 * types).  If this happens a check must be done to
715 			 * to determine if another receiver is available
716 			 * for the waking message type,  Failure to do this
717 			 * can result in a message on the queue that can be
718 			 * serviced by a sleeping receiver.
719 			 */
720 			if (!error && t && (copy_type != t))
721 				msg_wakeup_rdr(qp, &qp->msg_fnd_sndr, t);
722 
723 			/*
724 			 * Don't forget to wakeup a sleeper that blocked because
725 			 * we were copying things out.
726 			 */
727 			msg_wakeup_rdr(qp, &qp->msg_fnd_rdr, 0);
728 			goto msgrcv_out;
729 		}
730 		/*
731 		 * The selected message is being copied out, so block.  We do
732 		 * not need to wake the next person up on the msg_cpy_block list
733 		 * due to the fact some one is copying out and they will get
734 		 * things moving again once the copy is completed.
735 		 */
736 		cvres = msg_rcvq_sleep(&qp->msg_cpy_block,
737 		    &msg_entry, &lock, qp);
738 		error = msgq_check_err(qp, cvres);
739 		if (error) {
740 			goto msgrcv_out;
741 		}
742 		goto findmsg;
743 	}
744 	/*
745 	 * There isn't a message to copy out that matches the designated
746 	 * criteria.
747 	 */
748 	if (msgflg & IPC_NOWAIT) {
749 		error = ENOMSG;
750 		goto msgrcv_out;
751 	}
752 	msg_wakeup_rdr(qp,  &qp->msg_fnd_rdr, 0);
753 
754 	/*
755 	 * Wait for new message.  We keep the negative and positive types
756 	 * separate for performance reasons.
757 	 */
758 	msg_entry.msgw_snd_wake = 0;
759 	if (msgtyp >= 0) {
760 		cvres = msg_rcvq_sleep(&qp->msg_wait_snd[msg_hash],
761 		    &msg_entry, &lock, qp);
762 	} else {
763 		qp->msg_ngt_cnt++;
764 		cvres = msg_rcvq_sleep(&qp->msg_wait_snd_ngt[msg_hash],
765 		    &msg_entry, &lock, qp);
766 		qp->msg_ngt_cnt--;
767 	}
768 
769 	if (!(error = msgq_check_err(qp, cvres))) {
770 		goto findmsg;
771 	}
772 
773 msgrcv_out:
774 	if (error) {
775 		msg_wakeup_rdr(qp,  &qp->msg_fnd_rdr, 0);
776 		if (msg_entry.msgw_snd_wake) {
777 			msg_wakeup_rdr(qp, &qp->msg_fnd_sndr,
778 			    msg_entry.msgw_snd_wake);
779 		}
780 		ipc_rele(msq_svc, (kipc_perm_t *)qp);
781 		return ((ssize_t)set_errno(error));
782 	}
783 	ipc_rele(msq_svc, (kipc_perm_t *)qp);
784 	return ((ssize_t)xtsz);
785 }
786 
787 static int
788 msgq_check_err(kmsqid_t *qp, int cvres)
789 {
790 	if (IPC_FREE(&qp->msg_perm)) {
791 		return (EIDRM);
792 	}
793 
794 	if (cvres == 0) {
795 		return (EINTR);
796 	}
797 
798 	return (0);
799 }
800 
801 static int
802 msg_copyout(kmsqid_t *qp, long msgtyp, kmutex_t **lock, size_t *xtsz_ret,
803     size_t msgsz, struct msg *smp, struct ipcmsgbuf *msgp, int msgflg)
804 {
805 	size_t		xtsz;
806 	STRUCT_HANDLE(ipcmsgbuf, umsgp);
807 	model_t		mdl = get_udatamodel();
808 	int		copyerror = 0;
809 
810 	STRUCT_SET_HANDLE(umsgp, mdl, msgp);
811 	if (msgsz < smp->msg_size) {
812 		if ((msgflg & MSG_NOERROR) == 0) {
813 			return (E2BIG);
814 		} else {
815 			xtsz = msgsz;
816 		}
817 	} else {
818 		xtsz = smp->msg_size;
819 	}
820 	*xtsz_ret = xtsz;
821 
822 	/*
823 	 * To prevent a DOS attack we mark the message as being
824 	 * copied out and release mutex.  When the copy is completed
825 	 * we need to acquire the mutex and make the appropriate updates.
826 	 */
827 	ASSERT((smp->msg_flags & MSG_RCVCOPY) == 0);
828 	smp->msg_flags |= MSG_RCVCOPY;
829 	msg_hold(smp);
830 	if (msgtyp < 0) {
831 		ASSERT(qp->msg_neg_copy == 0);
832 		qp->msg_neg_copy = 1;
833 	}
834 	mutex_exit(*lock);
835 
836 	if (mdl == DATAMODEL_NATIVE) {
837 		copyerror = copyout(&smp->msg_type, msgp,
838 		    sizeof (smp->msg_type));
839 	} else {
840 		/*
841 		 * 32-bit callers need an imploded msg type.
842 		 */
843 		int32_t	msg_type32 = smp->msg_type;
844 
845 		copyerror = copyout(&msg_type32, msgp,
846 		    sizeof (msg_type32));
847 	}
848 
849 	if (copyerror == 0 && xtsz) {
850 		copyerror = copyout(smp->msg_addr,
851 		    STRUCT_FADDR(umsgp, mtext), xtsz);
852 	}
853 
854 	/*
855 	 * Reclaim the mutex and make sure the message queue still exists.
856 	 */
857 
858 	*lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
859 	if (msgtyp < 0) {
860 		qp->msg_neg_copy = 0;
861 	}
862 	ASSERT(smp->msg_flags & MSG_RCVCOPY);
863 	smp->msg_flags &= ~MSG_RCVCOPY;
864 	msg_rele(smp);
865 	if (IPC_FREE(&qp->msg_perm)) {
866 		return (EIDRM);
867 	}
868 	if (copyerror) {
869 		return (EFAULT);
870 	}
871 	qp->msg_lrpid = ttoproc(curthread)->p_pid;
872 	qp->msg_rtime = gethrestime_sec();
873 	msgunlink(qp, smp);
874 	return (0);
875 }
876 
877 static struct msg *
878 msgrcv_lookup(kmsqid_t *qp, long msgtyp)
879 {
880 	struct msg 		*smp = NULL;
881 	long			qp_low;
882 	struct msg		*mp;	/* ptr to msg on q */
883 	long			low_msgtype;
884 	static struct msg	neg_copy_smp;
885 
886 	mp = list_head(&qp->msg_list);
887 	if (msgtyp == 0) {
888 		smp = mp;
889 	} else {
890 		qp_low = qp->msg_lowest_type;
891 		if (msgtyp > 0) {
892 			/*
893 			 * If our lowest possible message type is larger than
894 			 * the message type desired, then we know there is
895 			 * no entry present.
896 			 */
897 			if (qp_low > msgtyp) {
898 				return (NULL);
899 			}
900 
901 			for (; mp; mp = list_next(&qp->msg_list, mp)) {
902 				if (msgtyp == mp->msg_type) {
903 					smp = mp;
904 					break;
905 				}
906 			}
907 		} else {
908 			/*
909 			 * We have kept track of the lowest possible message
910 			 * type on the send queue.  This allows us to terminate
911 			 * the search early if we find a message type of that
912 			 * type.  Note, the lowest type may not be the actual
913 			 * lowest value in the system, it is only guaranteed
914 			 * that there isn't a value lower than that.
915 			 */
916 			low_msgtype = -msgtyp;
917 			if (low_msgtype < qp_low) {
918 				return (NULL);
919 			}
920 			if (qp->msg_neg_copy) {
921 				neg_copy_smp.msg_flags = MSG_RCVCOPY;
922 				return (&neg_copy_smp);
923 			}
924 			for (; mp; mp = list_next(&qp->msg_list, mp)) {
925 				if (mp->msg_type <= low_msgtype &&
926 				    !(smp && smp->msg_type <= mp->msg_type)) {
927 					smp = mp;
928 					low_msgtype = mp->msg_type;
929 					if (low_msgtype == qp_low) {
930 						break;
931 					}
932 				}
933 			}
934 			if (smp) {
935 				/*
936 				 * Update the lowest message type.
937 				 */
938 				qp->msg_lowest_type = smp->msg_type;
939 			}
940 		}
941 	}
942 	return (smp);
943 }
944 
945 /*
946  * msgids system call.
947  */
948 static int
949 msgids(int *buf, uint_t nids, uint_t *pnids)
950 {
951 	int error;
952 
953 	if (error = ipc_ids(msq_svc, buf, nids, pnids))
954 		return (set_errno(error));
955 
956 	return (0);
957 }
958 
/*
 * Round a byte count up to a multiple of the word size of the caller's
 * data model: RND for native callers, RND32 for 32-bit callers.
 */
#define	RND(x)		roundup((x), sizeof (size_t))
#define	RND32(x)	roundup((x), sizeof (size32_t))
961 
962 /*
963  * msgsnap system call.
964  */
965 static int
966 msgsnap(int msqid, caddr_t buf, size_t bufsz, long msgtyp)
967 {
968 	struct msg	*mp;	/* ptr to msg on q */
969 	kmsqid_t	*qp;	/* ptr to associated q */
970 	kmutex_t	*lock;
971 	size_t		size;
972 	size_t		nmsg;
973 	struct msg	**snaplist;
974 	int		error, i;
975 	model_t		mdl = get_udatamodel();
976 	STRUCT_DECL(msgsnap_head, head);
977 	STRUCT_DECL(msgsnap_mhead, mhead);
978 
979 	STRUCT_INIT(head, mdl);
980 	STRUCT_INIT(mhead, mdl);
981 
982 	if (bufsz < STRUCT_SIZE(head))
983 		return (set_errno(EINVAL));
984 
985 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL)
986 		return (set_errno(EINVAL));
987 
988 	if (error = ipcperm_access(&qp->msg_perm, MSG_R, CRED())) {
989 		mutex_exit(lock);
990 		return (set_errno(error));
991 	}
992 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
993 
994 	/*
995 	 * First compute the required buffer size and
996 	 * the number of messages on the queue.
997 	 */
998 	size = nmsg = 0;
999 	for (mp = list_head(&qp->msg_list); mp;
1000 	    mp = list_next(&qp->msg_list, mp)) {
1001 		if (msgtyp == 0 ||
1002 		    (msgtyp > 0 && msgtyp == mp->msg_type) ||
1003 		    (msgtyp < 0 && mp->msg_type <= -msgtyp)) {
1004 			nmsg++;
1005 			if (mdl == DATAMODEL_NATIVE)
1006 				size += RND(mp->msg_size);
1007 			else
1008 				size += RND32(mp->msg_size);
1009 		}
1010 	}
1011 
1012 	size += STRUCT_SIZE(head) + nmsg * STRUCT_SIZE(mhead);
1013 	if (size > bufsz)
1014 		nmsg = 0;
1015 
1016 	if (nmsg > 0) {
1017 		/*
1018 		 * Mark the messages as being copied.
1019 		 */
1020 		snaplist = (struct msg **)kmem_alloc(nmsg *
1021 		    sizeof (struct msg *), KM_SLEEP);
1022 		i = 0;
1023 		for (mp = list_head(&qp->msg_list); mp;
1024 		    mp = list_next(&qp->msg_list, mp)) {
1025 			if (msgtyp == 0 ||
1026 			    (msgtyp > 0 && msgtyp == mp->msg_type) ||
1027 			    (msgtyp < 0 && mp->msg_type <= -msgtyp)) {
1028 				msg_hold(mp);
1029 				snaplist[i] = mp;
1030 				i++;
1031 			}
1032 		}
1033 	}
1034 	mutex_exit(lock);
1035 
1036 	/*
1037 	 * Copy out the buffer header.
1038 	 */
1039 	STRUCT_FSET(head, msgsnap_size, size);
1040 	STRUCT_FSET(head, msgsnap_nmsg, nmsg);
1041 	if (copyout(STRUCT_BUF(head), buf, STRUCT_SIZE(head)))
1042 		error = EFAULT;
1043 
1044 	buf += STRUCT_SIZE(head);
1045 
1046 	/*
1047 	 * Now copy out the messages one by one.
1048 	 */
1049 	for (i = 0; i < nmsg; i++) {
1050 		mp = snaplist[i];
1051 		if (error == 0) {
1052 			STRUCT_FSET(mhead, msgsnap_mlen, mp->msg_size);
1053 			STRUCT_FSET(mhead, msgsnap_mtype, mp->msg_type);
1054 			if (copyout(STRUCT_BUF(mhead), buf, STRUCT_SIZE(mhead)))
1055 				error = EFAULT;
1056 			buf += STRUCT_SIZE(mhead);
1057 
1058 			if (error == 0 &&
1059 			    mp->msg_size != 0 &&
1060 			    copyout(mp->msg_addr, buf, mp->msg_size))
1061 				error = EFAULT;
1062 			if (mdl == DATAMODEL_NATIVE)
1063 				buf += RND(mp->msg_size);
1064 			else
1065 				buf += RND32(mp->msg_size);
1066 		}
1067 		lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
1068 		msg_rele(mp);
1069 		/* Check for msg q deleted or reallocated */
1070 		if (IPC_FREE(&qp->msg_perm))
1071 			error = EIDRM;
1072 		mutex_exit(lock);
1073 	}
1074 
1075 	(void) ipc_lock(msq_svc, qp->msg_perm.ipc_id);
1076 	ipc_rele(msq_svc, (kipc_perm_t *)qp);
1077 
1078 	if (nmsg > 0)
1079 		kmem_free(snaplist, nmsg * sizeof (struct msg *));
1080 
1081 	if (error)
1082 		return (set_errno(error));
1083 	return (0);
1084 }
1085 
1086 #define	MSG_PREALLOC_LIMIT 8192
1087 
1088 /*
1089  * msgsnd system call.
1090  */
1091 static int
1092 msgsnd(int msqid, struct ipcmsgbuf *msgp, size_t msgsz, int msgflg)
1093 {
1094 	kmsqid_t	*qp;
1095 	kmutex_t	*lock = NULL;
1096 	struct msg	*mp = NULL;
1097 	long		type;
1098 	int		error = 0, wait_wakeup = 0;
1099 	msgq_wakeup_t   msg_entry;
1100 	model_t		mdl = get_udatamodel();
1101 	STRUCT_HANDLE(ipcmsgbuf, umsgp);
1102 
1103 	CPU_STATS_ADDQ(CPU, sys, msg, 1);	/* bump msg send/rcv count */
1104 	STRUCT_SET_HANDLE(umsgp, mdl, msgp);
1105 
1106 	if (mdl == DATAMODEL_NATIVE) {
1107 		if (copyin(msgp, &type, sizeof (type)))
1108 			return (set_errno(EFAULT));
1109 	} else {
1110 		int32_t	type32;
1111 		if (copyin(msgp, &type32, sizeof (type32)))
1112 			return (set_errno(EFAULT));
1113 		type = type32;
1114 	}
1115 
1116 	if (type < 1)
1117 		return (set_errno(EINVAL));
1118 
1119 	/*
1120 	 * We want the value here large enough that most of the
1121 	 * the message operations will use the "lockless" path,
1122 	 * but small enough that a user can not reserve large
1123 	 * chunks of kernel memory unless they have a valid
1124 	 * reason to.
1125 	 */
1126 	if (msgsz <= MSG_PREALLOC_LIMIT) {
1127 		/*
1128 		 * We are small enough that we can afford to do the
1129 		 * allocation now.  This saves dropping the lock
1130 		 * and then reacquiring the lock.
1131 		 */
1132 		mp = kmem_zalloc(sizeof (struct msg), KM_SLEEP);
1133 		mp->msg_copycnt = 1;
1134 		mp->msg_size = msgsz;
1135 		if (msgsz) {
1136 			mp->msg_addr = kmem_alloc(msgsz, KM_SLEEP);
1137 			if (copyin(STRUCT_FADDR(umsgp, mtext),
1138 			    mp->msg_addr, msgsz) == -1) {
1139 				error = EFAULT;
1140 				goto msgsnd_out;
1141 			}
1142 		}
1143 	}
1144 
1145 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL) {
1146 		error = EINVAL;
1147 		goto msgsnd_out;
1148 	}
1149 
1150 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
1151 
1152 	if (msgsz > qp->msg_qbytes) {
1153 		error = EINVAL;
1154 		goto msgsnd_out;
1155 	}
1156 
1157 	if (error = ipcperm_access(&qp->msg_perm, MSG_W, CRED()))
1158 		goto msgsnd_out;
1159 
1160 top:
1161 	/*
1162 	 * Allocate space on q, message header, & buffer space.
1163 	 */
1164 	ASSERT(qp->msg_qnum <= qp->msg_qmax);
1165 	while ((msgsz > qp->msg_qbytes - qp->msg_cbytes) ||
1166 	    (qp->msg_qnum == qp->msg_qmax)) {
1167 		int cvres;
1168 
1169 		if (msgflg & IPC_NOWAIT) {
1170 			error = EAGAIN;
1171 			goto msgsnd_out;
1172 		}
1173 
1174 		wait_wakeup = 0;
1175 		qp->msg_snd_cnt++;
1176 		msg_entry.msgw_snd_size = msgsz;
1177 		msg_entry.msgw_thrd = curthread;
1178 		msg_entry.msgw_type = type;
1179 		cv_init(&msg_entry.msgw_wake_cv, NULL, 0, NULL);
1180 		list_insert_tail(&qp->msg_wait_rcv, &msg_entry);
1181 		if (qp->msg_snd_smallest > msgsz)
1182 			qp->msg_snd_smallest = msgsz;
1183 		cvres = cv_wait_sig(&msg_entry.msgw_wake_cv, lock);
1184 		lock = ipc_relock(msq_svc, qp->msg_perm.ipc_id, lock);
1185 		qp->msg_snd_cnt--;
1186 		if (list_link_active(&msg_entry.msgw_list))
1187 			list_remove(&qp->msg_wait_rcv, &msg_entry);
1188 		if (error = msgq_check_err(qp, cvres)) {
1189 			goto msgsnd_out;
1190 		}
1191 		wait_wakeup = 1;
1192 	}
1193 
1194 	if (mp == NULL) {
1195 		int failure;
1196 
1197 		mutex_exit(lock);
1198 		ASSERT(msgsz > 0);
1199 		mp = kmem_zalloc(sizeof (struct msg), KM_SLEEP);
1200 		mp->msg_addr = kmem_alloc(msgsz, KM_SLEEP);
1201 		mp->msg_size = msgsz;
1202 		mp->msg_copycnt = 1;
1203 
1204 		failure = (copyin(STRUCT_FADDR(umsgp, mtext),
1205 		    mp->msg_addr, msgsz) == -1);
1206 		lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
1207 		if (IPC_FREE(&qp->msg_perm)) {
1208 			error = EIDRM;
1209 			goto msgsnd_out;
1210 		}
1211 		if (failure) {
1212 			error = EFAULT;
1213 			goto msgsnd_out;
1214 		}
1215 		goto top;
1216 	}
1217 
1218 	/*
1219 	 * Everything is available, put msg on q.
1220 	 */
1221 	qp->msg_qnum++;
1222 	qp->msg_cbytes += msgsz;
1223 	qp->msg_lspid = curproc->p_pid;
1224 	qp->msg_stime = gethrestime_sec();
1225 	mp->msg_type = type;
1226 	if (qp->msg_lowest_type > type)
1227 		qp->msg_lowest_type = type;
1228 	list_insert_tail(&qp->msg_list, mp);
1229 	/*
1230 	 * Get the proper receiver going.
1231 	 */
1232 	msg_wakeup_rdr(qp, &qp->msg_fnd_sndr, type);
1233 
1234 msgsnd_out:
1235 	/*
1236 	 * We were woken up from the send wait list, but an
1237 	 * an error occured on placing the message onto the
1238 	 * msg queue.  Given that, we need to do the wakeup
1239 	 * dance again.
1240 	 */
1241 
1242 	if (wait_wakeup && error) {
1243 		msg_wakeup_senders(qp);
1244 	}
1245 	if (lock)
1246 		ipc_rele(msq_svc, (kipc_perm_t *)qp);	/* drops lock */
1247 
1248 	if (error) {
1249 		if (mp)
1250 			msg_rele(mp);
1251 		return (set_errno(error));
1252 	}
1253 
1254 	return (0);
1255 }
1256 
1257 static void
1258 msg_wakeup_rdr(kmsqid_t *qp, msg_select_t **flist, long type)
1259 {
1260 	msg_select_t	*walker = *flist;
1261 	msgq_wakeup_t	*wakeup;
1262 	uint_t		msg_hash;
1263 
1264 	msg_hash = msg_type_hash(type);
1265 
1266 	do {
1267 		wakeup = walker->selection(qp, msg_hash, type);
1268 		walker = walker->next_selection;
1269 	} while (!wakeup && walker != *flist);
1270 
1271 	*flist = (*flist)->next_selection;
1272 	if (wakeup) {
1273 		if (type) {
1274 			wakeup->msgw_snd_wake = type;
1275 		}
1276 		cv_signal(&wakeup->msgw_wake_cv);
1277 	}
1278 }
1279 
1280 static uint_t
1281 msg_type_hash(long msg_type)
1282 {
1283 	if (msg_type < 0) {
1284 		long	hash = -msg_type / MSG_NEG_INTERVAL;
1285 		/*
1286 		 * Negative message types are hashed over an
1287 		 * interval.  Any message type that hashes
1288 		 * beyond MSG_MAX_QNUM is automatically placed
1289 		 * in the last bucket.
1290 		 */
1291 		if (hash > MSG_MAX_QNUM)
1292 			hash = MSG_MAX_QNUM;
1293 		return (hash);
1294 	}
1295 
1296 	/*
1297 	 * 0 or positive message type.  The first bucket is reserved for
1298 	 * message receivers of type 0, the other buckets we hash into.
1299 	 */
1300 	if (msg_type)
1301 		return (1 + (msg_type % MSG_MAX_QNUM));
1302 	return (0);
1303 }
1304 
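/*
 * Routines that look for a waiting receiver: msg_fnd_any_snd() checks the
 * type 0 bucket (msg_wait_snd[0]) and msg_fnd_any_rdr() checks the
 * msg_cpy_block list.  Each simply removes and returns the first entry on
 * its list, or NULL if the list is empty.
 */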
1309 
1310 static msgq_wakeup_t *
1311 /* ARGSUSED */
1312 msg_fnd_any_snd(kmsqid_t *qp, int msg_hash, long type)
1313 {
1314 	msgq_wakeup_t	*walker;
1315 
1316 	walker = list_head(&qp->msg_wait_snd[0]);
1317 
1318 	if (walker)
1319 		list_remove(&qp->msg_wait_snd[0], walker);
1320 	return (walker);
1321 }
1322 
1323 static msgq_wakeup_t *
1324 /* ARGSUSED */
1325 msg_fnd_any_rdr(kmsqid_t *qp, int msg_hash, long type)
1326 {
1327 	msgq_wakeup_t	*walker;
1328 
1329 	walker = list_head(&qp->msg_cpy_block);
1330 	if (walker)
1331 		list_remove(&qp->msg_cpy_block, walker);
1332 	return (walker);
1333 }
1334 
1335 static msgq_wakeup_t *
1336 msg_fnd_spc_snd(kmsqid_t *qp, int msg_hash, long type)
1337 {
1338 	msgq_wakeup_t	*walker;
1339 
1340 	walker = list_head(&qp->msg_wait_snd[msg_hash]);
1341 
1342 	while (walker && walker->msgw_type != type)
1343 		walker = list_next(&qp->msg_wait_snd[msg_hash], walker);
1344 	if (walker)
1345 		list_remove(&qp->msg_wait_snd[msg_hash], walker);
1346 	return (walker);
1347 }
1348 
1349 /* ARGSUSED */
1350 static msgq_wakeup_t *
1351 msg_fnd_neg_snd(kmsqid_t *qp, int msg_hash, long type)
1352 {
1353 	msgq_wakeup_t	*qptr;
1354 	int		count;
1355 	int		check_index;
1356 	int		neg_index;
1357 	int		nbuckets;
1358 
1359 	if (!qp->msg_ngt_cnt) {
1360 		return (NULL);
1361 	}
1362 	neg_index = msg_type_hash(-type);
1363 
1364 	/*
1365 	 * Check for a match among the negative type queues.  Any buckets
1366 	 * at neg_index or larger can match the type.  Use the last send
1367 	 * time to randomize the starting bucket to prevent starvation.
1368 	 * Search all buckets from neg_index to MSG_MAX_QNUM, starting
1369 	 * from the random starting point, and wrapping around after
1370 	 * MSG_MAX_QNUM.
1371 	 */
1372 
1373 	nbuckets = MSG_MAX_QNUM - neg_index + 1;
1374 	check_index = neg_index + (qp->msg_stime % nbuckets);
1375 
1376 	for (count = nbuckets; count > 0; count--) {
1377 		qptr = list_head(&qp->msg_wait_snd_ngt[check_index]);
1378 		while (qptr) {
1379 			/*
1380 			 * The lowest hash bucket may actually contain
1381 			 * message types that are not valid for this
1382 			 * request.  This can happen due to the fact that
1383 			 * the message buckets actually contain a consecutive
1384 			 * range of types.
1385 			 */
1386 			if (-qptr->msgw_type >= type) {
1387 				list_remove(&qp->msg_wait_snd_ngt[check_index],
1388 				    qptr);
1389 				return (qptr);
1390 			}
1391 			qptr = list_next(&qp->msg_wait_snd_ngt[check_index],
1392 			    qptr);
1393 		}
1394 		if (++check_index > MSG_MAX_QNUM) {
1395 			check_index = neg_index;
1396 		}
1397 	}
1398 	return (NULL);
1399 }
1400 
1401 static int
1402 msg_rcvq_sleep(list_t *queue, msgq_wakeup_t *entry, kmutex_t **lock,
1403     kmsqid_t *qp)
1404 {
1405 	int		cvres;
1406 
1407 	cv_init(&entry->msgw_wake_cv, NULL, 0, NULL);
1408 
1409 	list_insert_tail(queue, entry);
1410 
1411 	qp->msg_rcv_cnt++;
1412 	cvres = cv_wait_sig(&entry->msgw_wake_cv, *lock);
1413 	*lock = ipc_relock(msq_svc, qp->msg_perm.ipc_id, *lock);
1414 	qp->msg_rcv_cnt--;
1415 
1416 	if (list_link_active(&entry->msgw_list)) {
1417 		/*
1418 		 * We woke up unexpectedly, remove ourself.
1419 		 */
1420 		list_remove(queue, entry);
1421 	}
1422 
1423 	return (cvres);
1424 }
1425 
1426 static void
1427 msg_rcvq_wakeup_all(list_t *q_ptr)
1428 {
1429 	msgq_wakeup_t	*q_walk;
1430 
1431 	while (q_walk = list_head(q_ptr)) {
1432 		list_remove(q_ptr, q_walk);
1433 		cv_signal(&q_walk->msgw_wake_cv);
1434 	}
1435 }
1436 
1437 /*
1438  * msgsys - System entry point for msgctl, msgget, msgrcv, and msgsnd
1439  * system calls.
1440  */
1441 static ssize_t
1442 msgsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3,
1443 	uintptr_t a4, uintptr_t a5)
1444 {
1445 	ssize_t error;
1446 
1447 	switch (opcode) {
1448 	case MSGGET:
1449 		error = msgget((key_t)a1, (int)a2);
1450 		break;
1451 	case MSGCTL:
1452 		error = msgctl((int)a1, (int)a2, (void *)a3);
1453 		break;
1454 	case MSGRCV:
1455 		error = msgrcv((int)a1, (struct ipcmsgbuf *)a2,
1456 		    (size_t)a3, (long)a4, (int)a5);
1457 		break;
1458 	case MSGSND:
1459 		error = msgsnd((int)a1, (struct ipcmsgbuf *)a2,
1460 		    (size_t)a3, (int)a4);
1461 		break;
1462 	case MSGIDS:
1463 		error = msgids((int *)a1, (uint_t)a2, (uint_t *)a3);
1464 		break;
1465 	case MSGSNAP:
1466 		error = msgsnap((int)a1, (caddr_t)a2, (size_t)a3, (long)a4);
1467 		break;
1468 	default:
1469 		error = set_errno(EINVAL);
1470 		break;
1471 	}
1472 
1473 	return (error);
1474 }
1475 
1476 /*
1477  * Determine if a writer who is waiting can process its message.  If so
1478  * wake it up.
1479  */
1480 static void
1481 msg_wakeup_senders(kmsqid_t *qp)
1482 
1483 {
1484 	struct msgq_wakeup *ptr, *optr;
1485 	size_t avail, smallest;
1486 	int msgs_out;
1487 
1488 	/*
1489 	 * Is there a writer waiting, and if so, can it be serviced? If
1490 	 * not return back to the caller.
1491 	 */
1492 	if (IPC_FREE(&qp->msg_perm) || qp->msg_qnum >= qp->msg_qmax)
1493 		return;
1494 
1495 	avail = qp->msg_qbytes - qp->msg_cbytes;
1496 	if (avail < qp->msg_snd_smallest)
1497 		return;
1498 
1499 	ptr = list_head(&qp->msg_wait_rcv);
1500 	if (ptr == NULL) {
1501 		qp->msg_snd_smallest = MSG_SMALL_INIT;
1502 		return;
1503 	}
1504 	optr = ptr;
1505 
1506 	/*
1507 	 * smallest:	minimum message size of all queued writers
1508 	 *
1509 	 * avail:	amount of space left on the msgq
1510 	 *		if all the writers we have woken up are successful.
1511 	 *
1512 	 * msgs_out:	is the number of messages on the message queue if
1513 	 *		all the writers we have woken up are successful.
1514 	 */
1515 
1516 	smallest = MSG_SMALL_INIT;
1517 	msgs_out = qp->msg_qnum;
1518 	while (ptr) {
1519 		ptr = list_next(&qp->msg_wait_rcv, ptr);
1520 		if (optr->msgw_snd_size <= avail) {
1521 			list_remove(&qp->msg_wait_rcv, optr);
1522 			avail -= optr->msgw_snd_size;
1523 			cv_signal(&optr->msgw_wake_cv);
1524 			msgs_out++;
1525 			if (msgs_out == qp->msg_qmax ||
1526 			    avail < qp->msg_snd_smallest)
1527 				break;
1528 		} else {
1529 			if (smallest > optr->msgw_snd_size)
1530 				smallest = optr->msgw_snd_size;
1531 		}
1532 		optr = ptr;
1533 	}
1534 
1535 	/*
1536 	 * Reset the smallest message size if the entire list has been visited
1537 	 */
1538 	if (ptr == NULL && smallest != MSG_SMALL_INIT)
1539 		qp->msg_snd_smallest = smallest;
1540 }
1541 
#ifdef	_SYSCALL32_IMPL
/*
 * msgsys32 - System entry point for the msgget, msgctl, msgrcv, msgsnd,
 * msgids and msgsnap system calls for 32-bit callers on LP64 kernel.
 *
 * Pointer arguments are widened through uintptr_t.  The message type
 * passed to msgrcv/msgsnap and the size passed to msgsnd go through
 * int32_t first, so they are sign-extended when widened.  An
 * unrecognised opcode fails with EINVAL.
 */
static ssize32_t
msgsys32(int opcode, uint32_t a1, uint32_t a2, uint32_t a3,
	uint32_t a4, uint32_t a5)
{
	switch (opcode) {
	case MSGGET:
		return (msgget((key_t)a1, (int)a2));
	case MSGCTL:
		return (msgctl((int)a1, (int)a2, (void *)(uintptr_t)a3));
	case MSGRCV:
		return (msgrcv((int)a1, (struct ipcmsgbuf *)(uintptr_t)a2,
		    (size_t)a3, (long)(int32_t)a4, (int)a5));
	case MSGSND:
		return (msgsnd((int)a1, (struct ipcmsgbuf *)(uintptr_t)a2,
		    (size_t)(int32_t)a3, (int)a4));
	case MSGIDS:
		return (msgids((int *)(uintptr_t)a1, (uint_t)a2,
		    (uint_t *)(uintptr_t)a3));
	case MSGSNAP:
		return (msgsnap((int)a1, (caddr_t)(uintptr_t)a2, (size_t)a3,
		    (long)(int32_t)a4));
	default:
		return (set_errno(EINVAL));
	}
}
#endif	/* _SYSCALL32_IMPL */
1584