xref: /titanic_51/usr/src/uts/common/fs/sockfs/sockcommon_subr.c (revision 54d5ddcceae506b00e8889ad38c9d15489f670c5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/signal.h>
30 #include <sys/cmn_err.h>
31 
32 #include <sys/stropts.h>
33 #include <sys/socket.h>
34 #include <sys/socketvar.h>
35 #include <sys/sockio.h>
36 #include <sys/sodirect.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/atomic.h>
40 
41 #include <fs/sockfs/sockcommon.h>
42 #include <fs/sockfs/socktpi.h>
43 #include <sys/ddi.h>
44 #include <inet/ip.h>
45 #include <sys/time.h>
46 #include <sys/cmn_err.h>
47 
48 #ifdef SOCK_TEST
49 extern int do_useracc;
50 extern clock_t sock_test_timelimit;
51 #endif /* SOCK_TEST */
52 
53 #define	MBLK_PULL_LEN 64
54 uint32_t so_mblk_pull_len = MBLK_PULL_LEN;
55 
56 #ifdef DEBUG
57 boolean_t so_debug_length = B_FALSE;
58 static boolean_t so_check_length(sonode_t *so);
59 #endif
60 
61 int
62 so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
63 {
64 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
65 	ASSERT(nso->so_acceptq_next == NULL);
66 
67 	*so->so_acceptq_tail = nso;
68 	so->so_acceptq_tail = &nso->so_acceptq_next;
69 	so->so_acceptq_len++;
70 
71 	/* Wakeup a single consumer */
72 	cv_signal(&so->so_acceptq_cv);
73 
74 	return (so->so_acceptq_len);
75 }
76 
77 /*
78  * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
79  *
80  * Enqueue an incoming connection on a listening socket.
81  *
82  * Arguments:
83  *   so	  - listening socket
84  *   nso  - new connection
85  *
86  * Returns:
87  *   Number of queued connections, including the new connection
88  */
89 int
90 so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
91 {
92 	int conns;
93 
94 	mutex_enter(&so->so_acceptq_lock);
95 	conns = so_acceptq_enqueue_locked(so, nso);
96 	mutex_exit(&so->so_acceptq_lock);
97 
98 	return (conns);
99 }
100 
101 static int
102 so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
103     struct sonode **nsop)
104 {
105 	struct sonode *nso = NULL;
106 
107 	*nsop = NULL;
108 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
109 	while ((nso = so->so_acceptq_head) == NULL) {
110 		/*
111 		 * No need to check so_error here, because it is not
112 		 * possible for a listening socket to be reset or otherwise
113 		 * disconnected.
114 		 *
115 		 * So now we just need check if it's ok to wait.
116 		 */
117 		if (dontblock)
118 			return (EWOULDBLOCK);
119 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
120 			return (EINTR);
121 
122 		if (cv_wait_sig_swap(&so->so_acceptq_cv,
123 		    &so->so_acceptq_lock) == 0)
124 			return (EINTR);
125 	}
126 
127 	ASSERT(nso != NULL);
128 	so->so_acceptq_head = nso->so_acceptq_next;
129 	nso->so_acceptq_next = NULL;
130 
131 	if (so->so_acceptq_head == NULL) {
132 		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
133 		so->so_acceptq_tail = &so->so_acceptq_head;
134 	}
135 	ASSERT(so->so_acceptq_len > 0);
136 	--so->so_acceptq_len;
137 
138 	*nsop = nso;
139 
140 	return (0);
141 }
142 
143 /*
144  * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
145  *
146  * Pulls a connection off of the accept queue.
147  *
148  * Arguments:
149  *   so	       - listening socket
150  *   dontblock - indicate whether it's ok to sleep if there are no
151  *		 connections on the queue
152  *   nsop      - Value-return argument
153  *
154  * Return values:
155  *   0 when a connection is successfully dequeued, in which case nsop
156  *   is set to point to the new connection. Upon failure a non-zero
157  *   value is returned, and the value of nsop is set to NULL.
158  *
159  * Note:
160  *   so_acceptq_dequeue() may return prematurly if the socket is falling
161  *   back to TPI.
162  */
163 int
164 so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
165     struct sonode **nsop)
166 {
167 	int error;
168 
169 	mutex_enter(&so->so_acceptq_lock);
170 	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
171 	mutex_exit(&so->so_acceptq_lock);
172 
173 	return (error);
174 }
175 
176 /*
177  * void so_acceptq_flush(struct sonode *so)
178  *
179  * Removes all pending connections from a listening socket, and
180  * frees the associated resources.
181  *
182  * Arguments
183  *   so	    - listening socket
184  *
185  * Return values:
186  *   None.
187  *
188  * Note:
189  *   The caller has to ensure that no calls to so_acceptq_enqueue() or
190  *   so_acceptq_dequeue() occur while the accept queue is being flushed.
191  *   So either the socket needs to be in a state where no operations
192  *   would come in, or so_lock needs to be obtained.
193  */
194 void
195 so_acceptq_flush(struct sonode *so)
196 {
197 	struct sonode *nso;
198 
199 	nso = so->so_acceptq_head;
200 
201 	while (nso != NULL) {
202 		struct sonode *nnso = NULL;
203 
204 		nnso = nso->so_acceptq_next;
205 		nso->so_acceptq_next = NULL;
206 		/*
207 		 * Since the socket is on the accept queue, there can
208 		 * only be one reference. We drop the reference and
209 		 * just blow off the socket.
210 		 */
211 		ASSERT(nso->so_count == 1);
212 		nso->so_count--;
213 		socket_destroy(nso);
214 		nso = nnso;
215 	}
216 
217 	so->so_acceptq_head = NULL;
218 	so->so_acceptq_tail = &so->so_acceptq_head;
219 	so->so_acceptq_len = 0;
220 }
221 
222 int
223 so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
224     sock_connid_t id)
225 {
226 	ASSERT(MUTEX_HELD(&so->so_lock));
227 
228 	/*
229 	 * The protocol has notified us that a connection attempt is being
230 	 * made, so before we wait for a notification to arrive we must
231 	 * clear out any errors associated with earlier connection attempts.
232 	 */
233 	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
234 		so->so_error = 0;
235 
236 	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
237 		if (nonblock)
238 			return (EINPROGRESS);
239 
240 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
241 			return (EINTR);
242 
243 		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
244 			return (EINTR);
245 	}
246 
247 	if (so->so_error != 0)
248 		return (sogeterr(so, B_TRUE));
249 	/*
250 	 * Under normal circumstances, so_error should contain an error
251 	 * in case the connect failed. However, it is possible for another
252 	 * thread to come in a consume the error, so generate a sensible
253 	 * error in that case.
254 	 */
255 	if ((so->so_state & SS_ISCONNECTED) == 0)
256 		return (ECONNREFUSED);
257 
258 	return (0);
259 }
260 
261 /*
262  * int so_wait_connected(struct sonode *so, boolean_t nonblock,
263  *    sock_connid_t id)
264  *
265  * Wait until the socket is connected or an error has occured.
266  *
267  * Arguments:
268  *   so	      - socket
269  *   nonblock - indicate whether it's ok to sleep if the connection has
270  *		not yet been established
271  *   gen      - generation number that was returned by the protocol
272  *		when the operation was started
273  *
274  * Returns:
275  *   0 if the connection attempt was successful, or an error indicating why
276  *   the connection attempt failed.
277  */
278 int
279 so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
280 {
281 	int error;
282 
283 	mutex_enter(&so->so_lock);
284 	error = so_wait_connected_locked(so, nonblock, id);
285 	mutex_exit(&so->so_lock);
286 
287 	return (error);
288 }
289 
290 int
291 so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
292 {
293 	int error;
294 
295 	ASSERT(MUTEX_HELD(&so->so_lock));
296 	while (so->so_snd_qfull) {
297 		if (so->so_state & SS_CANTSENDMORE)
298 			return (EPIPE);
299 		if (dontblock)
300 			return (EWOULDBLOCK);
301 
302 		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
303 			return (EINTR);
304 
305 		if (so->so_sndtimeo == 0) {
306 			/*
307 			 * Zero means disable timeout.
308 			 */
309 			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
310 		} else {
311 			clock_t now;
312 
313 			time_to_wait(&now, so->so_sndtimeo);
314 			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
315 			    now);
316 		}
317 		if (error == 0)
318 			return (EINTR);
319 		else if (error == -1)
320 			return (EAGAIN);
321 	}
322 	return (0);
323 }
324 
325 /*
326  * int so_wait_sendbuf(struct sonode *so, boolean_t dontblock)
327  *
328  * Wait for the transport to notify us about send buffers becoming
329  * available.
330  */
331 int
332 so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
333 {
334 	int error = 0;
335 
336 	mutex_enter(&so->so_lock);
337 	if (so->so_snd_qfull) {
338 		so->so_snd_wakeup = B_TRUE;
339 		error = so_snd_wait_qnotfull_locked(so, dontblock);
340 		so->so_snd_wakeup = B_FALSE;
341 	}
342 	mutex_exit(&so->so_lock);
343 
344 	return (error);
345 }
346 
347 void
348 so_snd_qfull(struct sonode *so)
349 {
350 	mutex_enter(&so->so_lock);
351 	so->so_snd_qfull = B_TRUE;
352 	mutex_exit(&so->so_lock);
353 }
354 
355 void
356 so_snd_qnotfull(struct sonode *so)
357 {
358 	mutex_enter(&so->so_lock);
359 	so->so_snd_qfull = B_FALSE;
360 	/* wake up everyone waiting for buffers */
361 	cv_broadcast(&so->so_snd_cv);
362 	mutex_exit(&so->so_lock);
363 }
364 
365 /*
366  * Change the process/process group to which SIGIO is sent.
367  */
368 int
369 socket_chgpgrp(struct sonode *so, pid_t pid)
370 {
371 	int error;
372 
373 	ASSERT(MUTEX_HELD(&so->so_lock));
374 	if (pid != 0) {
375 		/*
376 		 * Permissions check by sending signal 0.
377 		 * Note that when kill fails it does a
378 		 * set_errno causing the system call to fail.
379 		 */
380 		error = kill(pid, 0);
381 		if (error != 0) {
382 			return (error);
383 		}
384 	}
385 	so->so_pgrp = pid;
386 	return (0);
387 }
388 
389 
390 /*
391  * Generate a SIGIO, for 'writable' events include siginfo structure,
392  * for read events just send the signal.
393  */
394 /*ARGSUSED*/
395 static void
396 socket_sigproc(proc_t *proc, int event)
397 {
398 	k_siginfo_t info;
399 
400 	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
401 
402 	if (event & SOCKETSIG_WRITE) {
403 		info.si_signo = SIGPOLL;
404 		info.si_code = POLL_OUT;
405 		info.si_errno = 0;
406 		info.si_fd = 0;
407 		info.si_band = 0;
408 		sigaddq(proc, NULL, &info, KM_NOSLEEP);
409 	}
410 	if (event & SOCKETSIG_READ) {
411 		sigtoproc(proc, NULL, SIGPOLL);
412 	}
413 	if (event & SOCKETSIG_URG) {
414 		sigtoproc(proc, NULL, SIGURG);
415 	}
416 }
417 
418 void
419 socket_sendsig(struct sonode *so, int event)
420 {
421 	proc_t *proc;
422 
423 	ASSERT(MUTEX_HELD(&so->so_lock));
424 
425 	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
426 	    event != SOCKETSIG_URG)) {
427 		return;
428 	}
429 
430 	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
431 
432 	if (so->so_pgrp > 0) {
433 		/*
434 		 * XXX This unfortunately still generates
435 		 * a signal when a fd is closed but
436 		 * the proc is active.
437 		 */
438 		mutex_enter(&pidlock);
439 		proc = prfind(so->so_pgrp);
440 		if (proc == NULL) {
441 			mutex_exit(&pidlock);
442 			return;
443 		}
444 		mutex_enter(&proc->p_lock);
445 		mutex_exit(&pidlock);
446 		socket_sigproc(proc, event);
447 		mutex_exit(&proc->p_lock);
448 	} else {
449 		/*
450 		 * Send to process group. Hold pidlock across
451 		 * calls to socket_sigproc().
452 		 */
453 		pid_t pgrp = -so->so_pgrp;
454 
455 		mutex_enter(&pidlock);
456 		proc = pgfind(pgrp);
457 		while (proc != NULL) {
458 			mutex_enter(&proc->p_lock);
459 			socket_sigproc(proc, event);
460 			mutex_exit(&proc->p_lock);
461 			proc = proc->p_pglink;
462 		}
463 		mutex_exit(&pidlock);
464 	}
465 }
466 
467 #define	MIN(a, b) ((a) < (b) ? (a) : (b))
468 /* Copy userdata into a new mblk_t */
469 mblk_t *
470 socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
471     size_t tail_len, int *errorp, cred_t *cr)
472 {
473 	mblk_t	*head = NULL, **tail = &head;
474 
475 	ASSERT(iosize == INFPSZ || iosize > 0);
476 
477 	if (iosize == INFPSZ || iosize > uiop->uio_resid)
478 		iosize = uiop->uio_resid;
479 
480 	if (maxblk == INFPSZ)
481 		maxblk = iosize;
482 
483 	/* Nothing to do in these cases, so we're done */
484 	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
485 		goto done;
486 
487 	/*
488 	 * We will enter the loop below if iosize is 0; it will allocate an
489 	 * empty message block and call uiomove(9F) which will just return.
490 	 * We could avoid that with an extra check but would only slow
491 	 * down the much more likely case where iosize is larger than 0.
492 	 */
493 	do {
494 		ssize_t blocksize;
495 		mblk_t	*mp;
496 
497 		blocksize = MIN(iosize, maxblk);
498 		ASSERT(blocksize >= 0);
499 		if (is_system_labeled())
500 			mp = allocb_cred(wroff + blocksize + tail_len,
501 			    cr, curproc->p_pid);
502 		else
503 			mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
504 		if (mp == NULL) {
505 			*errorp = ENOMEM;
506 			return (head);
507 		}
508 		mp->b_rptr += wroff;
509 		mp->b_wptr = mp->b_rptr + blocksize;
510 
511 		*tail = mp;
512 		tail = &mp->b_cont;
513 
514 		/* uiomove(9F) either returns 0 or EFAULT */
515 		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
516 		    UIO_WRITE, uiop)) != 0) {
517 			ASSERT(*errorp != ENOMEM);
518 			freemsg(head);
519 			return (NULL);
520 		}
521 
522 		iosize -= blocksize;
523 	} while (iosize > 0);
524 
525 done:
526 	*errorp = 0;
527 	return (head);
528 }
529 
530 mblk_t *
531 socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
532 {
533 	int error;
534 	ptrdiff_t n;
535 	mblk_t *nmp;
536 
537 	ASSERT(mp->b_wptr >= mp->b_rptr);
538 
539 	/*
540 	 * max_read is the offset of the oobmark and read can not go pass
541 	 * the oobmark.
542 	 */
543 	if (max_read == INFPSZ || max_read > uiop->uio_resid)
544 		max_read = uiop->uio_resid;
545 
546 	do {
547 		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
548 			ASSERT(n > 0);
549 
550 			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
551 			if (error != 0) {
552 				freemsg(mp);
553 				*errorp = error;
554 				return (NULL);
555 			}
556 		}
557 
558 		mp->b_rptr += n;
559 		max_read -= n;
560 		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
561 			/*
562 			 * get rid of zero length mblks
563 			 */
564 			nmp = mp;
565 			mp = mp->b_cont;
566 			freeb(nmp);
567 		}
568 	} while (mp != NULL && max_read > 0);
569 
570 	*errorp = 0;
571 	return (mp);
572 }
573 
574 static void
575 so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
576 {
577 	ASSERT(last_tail != NULL);
578 	mp->b_next = so->so_rcv_q_head;
579 	mp->b_prev = last_tail;
580 	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
581 
582 	if (so->so_rcv_q_head == NULL) {
583 		ASSERT(so->so_rcv_q_last_head == NULL);
584 		so->so_rcv_q_last_head = mp;
585 #ifdef DEBUG
586 	} else {
587 		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
588 #endif
589 	}
590 	so->so_rcv_q_head = mp;
591 
592 #ifdef DEBUG
593 	if (so_debug_length) {
594 		mutex_enter(&so->so_lock);
595 		ASSERT(so_check_length(so));
596 		mutex_exit(&so->so_lock);
597 	}
598 #endif
599 }
600 
601 static void
602 process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
603 {
604 	ASSERT(mp_head->b_prev != NULL);
605 	if (so->so_rcv_q_head  == NULL) {
606 		so->so_rcv_q_head = mp_head;
607 		so->so_rcv_q_last_head = mp_last_head;
608 		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
609 	} else {
610 		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
611 		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));
612 
613 		if (mp_head->b_next == NULL &&
614 		    DB_TYPE(mp_head) == M_DATA &&
615 		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
616 			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
617 			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
618 			mp_head->b_prev = NULL;
619 		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
620 			/*
621 			 * Append to last_head if more than one mblks, and both
622 			 * mp_head and last_head are I/OAT mblks.
623 			 */
624 			ASSERT(mp_head->b_next != NULL);
625 			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
626 			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
627 			mp_head->b_prev = NULL;
628 
629 			so->so_rcv_q_last_head->b_next = mp_head->b_next;
630 			mp_head->b_next = NULL;
631 			so->so_rcv_q_last_head = mp_last_head;
632 		} else {
633 #ifdef DEBUG
634 			{
635 				mblk_t *tmp_mblk;
636 				tmp_mblk = mp_head;
637 				while (tmp_mblk != NULL) {
638 					ASSERT(tmp_mblk->b_prev != NULL);
639 					tmp_mblk = tmp_mblk->b_next;
640 				}
641 			}
642 #endif
643 			so->so_rcv_q_last_head->b_next = mp_head;
644 			so->so_rcv_q_last_head = mp_last_head;
645 		}
646 	}
647 }
648 
649 int
650 so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
651     rval_t *rvalp, int flags)
652 {
653 	mblk_t	*mp, *nmp;
654 	mblk_t	*savemp, *savemptail;
655 	mblk_t	*new_msg_head;
656 	mblk_t	*new_msg_last_head;
657 	mblk_t	*last_tail;
658 	boolean_t partial_read;
659 	boolean_t reset_atmark = B_FALSE;
660 	int more = 0;
661 	int error;
662 	ssize_t oobmark;
663 	sodirect_t *sodp = so->so_direct;
664 
665 	partial_read = B_FALSE;
666 	*mctlp = NULL;
667 again:
668 	mutex_enter(&so->so_lock);
669 again1:
670 #ifdef DEBUG
671 	if (so_debug_length) {
672 		ASSERT(so_check_length(so));
673 	}
674 #endif
675 	/*
676 	 * First move messages from the dump area to processing area
677 	 */
678 	if (sodp != NULL) {
679 		/* No need to grab sod_lockp since it pointers to so_lock */
680 		if (sodp->sod_state & SOD_ENABLED) {
681 			ASSERT(sodp->sod_lockp == &so->so_lock);
682 
683 			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
684 				/* nothing to uioamove */
685 				sodp = NULL;
686 			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
687 				sodp->sod_uioa.uioa_state &= UIOA_CLR;
688 				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
689 				/*
690 				 * try to uioamove() the data that
691 				 * has already queued.
692 				 */
693 				sod_uioa_so_init(so, sodp, uiop);
694 			}
695 		} else {
696 			sodp = NULL;
697 		}
698 	}
699 	new_msg_head = so->so_rcv_head;
700 	new_msg_last_head = so->so_rcv_last_head;
701 	so->so_rcv_head = NULL;
702 	so->so_rcv_last_head = NULL;
703 	oobmark = so->so_oobmark;
704 	/*
705 	 * We can release the lock as there can only be one reader
706 	 */
707 	mutex_exit(&so->so_lock);
708 
709 	if (so->so_state & SS_RCVATMARK) {
710 		reset_atmark = B_TRUE;
711 	}
712 	if (new_msg_head != NULL) {
713 		process_new_message(so, new_msg_head, new_msg_last_head);
714 	}
715 	savemp = savemptail = NULL;
716 	rvalp->r_val1 = 0;
717 	error = 0;
718 	mp = so->so_rcv_q_head;
719 
720 	if (mp != NULL &&
721 	    (so->so_rcv_timer_tid == 0 ||
722 	    so->so_rcv_queued >= so->so_rcv_thresh)) {
723 		partial_read = B_FALSE;
724 
725 		if (flags & MSG_PEEK) {
726 			if ((nmp = dupmsg(mp)) == NULL &&
727 			    (nmp = copymsg(mp)) == NULL) {
728 				size_t size = msgsize(mp);
729 
730 				error = strwaitbuf(size, BPRI_HI);
731 				if (error) {
732 					return (error);
733 				}
734 				goto again;
735 			}
736 			mp = nmp;
737 		} else {
738 			ASSERT(mp->b_prev != NULL);
739 			last_tail = mp->b_prev;
740 			mp->b_prev = NULL;
741 			so->so_rcv_q_head = mp->b_next;
742 			if (so->so_rcv_q_head == NULL) {
743 				so->so_rcv_q_last_head = NULL;
744 			}
745 			mp->b_next = NULL;
746 		}
747 
748 		ASSERT(mctlp != NULL);
749 		/*
750 		 * First process PROTO or PCPROTO blocks, if any.
751 		 */
752 		if (DB_TYPE(mp) != M_DATA) {
753 			*mctlp = mp;
754 			savemp = mp;
755 			savemptail = mp;
756 			ASSERT(DB_TYPE(mp) == M_PROTO ||
757 			    DB_TYPE(mp) == M_PCPROTO);
758 			while (mp->b_cont != NULL &&
759 			    DB_TYPE(mp->b_cont) != M_DATA) {
760 				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
761 				    DB_TYPE(mp->b_cont) == M_PCPROTO);
762 				mp = mp->b_cont;
763 				savemptail = mp;
764 			}
765 			mp = savemptail->b_cont;
766 			savemptail->b_cont = NULL;
767 		}
768 
769 		ASSERT(DB_TYPE(mp) == M_DATA);
770 		/*
771 		 * Now process DATA blocks, if any. Note that for sodirect
772 		 * enabled socket, uio_resid can be 0.
773 		 */
774 		if (uiop->uio_resid >= 0) {
775 			ssize_t copied = 0;
776 
777 			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
778 				mutex_enter(sodp->sod_lockp);
779 				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
780 				copied = sod_uioa_mblk(so, mp);
781 				if (copied > 0)
782 					partial_read = B_TRUE;
783 				mutex_exit(sodp->sod_lockp);
784 				/* mark this mblk as processed */
785 				mp = NULL;
786 			} else {
787 				ssize_t oldresid = uiop->uio_resid;
788 
789 				if (MBLKL(mp) < so_mblk_pull_len) {
790 					if (pullupmsg(mp, -1) == 1) {
791 						last_tail = mp;
792 					}
793 				}
794 				/*
795 				 * Can not read beyond the oobmark
796 				 */
797 				mp = socopyoutuio(mp, uiop,
798 				    oobmark == 0 ? INFPSZ : oobmark, &error);
799 				if (error != 0) {
800 					freemsg(*mctlp);
801 					*mctlp = NULL;
802 					more = 0;
803 					goto done;
804 				}
805 				ASSERT(oldresid >= uiop->uio_resid);
806 				copied = oldresid - uiop->uio_resid;
807 				if (oldresid > uiop->uio_resid)
808 					partial_read = B_TRUE;
809 			}
810 			ASSERT(copied >= 0);
811 			if (copied > 0 && !(flags & MSG_PEEK)) {
812 				mutex_enter(&so->so_lock);
813 				so->so_rcv_queued -= copied;
814 				ASSERT(so->so_oobmark >= 0);
815 				if (so->so_oobmark > 0) {
816 					so->so_oobmark -= copied;
817 					ASSERT(so->so_oobmark >= 0);
818 					if (so->so_oobmark == 0) {
819 						ASSERT(so->so_state &
820 						    SS_OOBPEND);
821 						so->so_oobmark = 0;
822 						so->so_state |= SS_RCVATMARK;
823 					}
824 				}
825 				if (so->so_flowctrld && so->so_rcv_queued <
826 				    so->so_rcvlowat) {
827 					so->so_flowctrld = B_FALSE;
828 					mutex_exit(&so->so_lock);
829 					/*
830 					 * Open up flow control. SCTP does
831 					 * not have any downcalls, and it will
832 					 * clr flow ctrl in sosctp_recvmsg().
833 					 */
834 					if (so->so_downcalls != NULL &&
835 					    so->so_downcalls->sd_clr_flowctrl !=
836 					    NULL) {
837 						(*so->so_downcalls->
838 						    sd_clr_flowctrl)
839 						    (so->so_proto_handle);
840 					}
841 				} else {
842 					mutex_exit(&so->so_lock);
843 				}
844 			}
845 		}
846 		if (mp != NULL) { /* more data blocks in msg */
847 			more |= MOREDATA;
848 			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
849 				if (flags & MSG_TRUNC &&
850 				    ((flags & MSG_PEEK) == 0)) {
851 					mutex_enter(&so->so_lock);
852 					so->so_rcv_queued -= msgdsize(mp);
853 					mutex_exit(&so->so_lock);
854 				}
855 				freemsg(mp);
856 			} else if (partial_read && !somsghasdata(mp)) {
857 				/*
858 				 * Avoid queuing a zero-length tail part of
859 				 * a message. partial_read == 1 indicates that
860 				 * we read some of the message.
861 				 */
862 				freemsg(mp);
863 				more &= ~MOREDATA;
864 			} else {
865 				if (savemp != NULL &&
866 				    (flags & MSG_DUPCTRL)) {
867 					mblk_t *nmp;
868 					/*
869 					 * There should only be non data mblks
870 					 */
871 					ASSERT(DB_TYPE(savemp) != M_DATA &&
872 					    DB_TYPE(savemptail) != M_DATA);
873 try_again:
874 					if ((nmp = dupmsg(savemp)) == NULL &&
875 					    (nmp = copymsg(savemp)) == NULL) {
876 
877 						size_t size = msgsize(savemp);
878 
879 						error = strwaitbuf(size,
880 						    BPRI_HI);
881 						if (error != 0) {
882 							/*
883 							 * In case we
884 							 * cannot copy
885 							 * control data
886 							 * free the remaining
887 							 * data.
888 							 */
889 							freemsg(mp);
890 							goto done;
891 						}
892 						goto try_again;
893 					}
894 
895 					ASSERT(nmp != NULL);
896 					ASSERT(DB_TYPE(nmp) != M_DATA);
897 					savemptail->b_cont = mp;
898 					*mctlp = nmp;
899 					mp = savemp;
900 				}
901 				/*
902 				 * putback mp
903 				 */
904 				so_prepend_msg(so, mp, last_tail);
905 			}
906 		}
907 
908 		/* fast check so_rcv_head if there is more data */
909 		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
910 		    *mctlp == NULL && uiop->uio_resid > 0 &&
911 		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
912 			goto again;
913 		}
914 	} else if (!partial_read) {
915 		mutex_enter(&so->so_lock);
916 		if (so->so_error != 0) {
917 			error = sogeterr(so, !(flags & MSG_PEEK));
918 			mutex_exit(&so->so_lock);
919 			return (error);
920 		}
921 		/*
922 		 * No pending data. Return right away for nonblocking
923 		 * socket, otherwise sleep waiting for data.
924 		 */
925 		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
926 			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
927 			    (flags & MSG_DONTWAIT)) {
928 				error = EWOULDBLOCK;
929 			} else {
930 				if (so->so_state & (SS_CLOSING |
931 				    SS_FALLBACK_PENDING)) {
932 					mutex_exit(&so->so_lock);
933 					error = EINTR;
934 					goto done;
935 				}
936 
937 				if (so->so_rcv_head != NULL) {
938 					goto again1;
939 				}
940 				so->so_rcv_wakeup = B_TRUE;
941 				so->so_rcv_wanted = uiop->uio_resid;
942 				if (so->so_rcvtimeo == 0) {
943 					/*
944 					 * Zero means disable timeout.
945 					 */
946 					error = cv_wait_sig(&so->so_rcv_cv,
947 					    &so->so_lock);
948 				} else {
949 					clock_t now;
950 					time_to_wait(&now, so->so_rcvtimeo);
951 					error = cv_timedwait_sig(&so->so_rcv_cv,
952 					    &so->so_lock, now);
953 				}
954 				so->so_rcv_wakeup = B_FALSE;
955 				so->so_rcv_wanted = 0;
956 
957 				if (error == 0) {
958 					error = EINTR;
959 				} else if (error == -1) {
960 					error = EAGAIN;
961 				} else {
962 					goto again1;
963 				}
964 			}
965 		}
966 		mutex_exit(&so->so_lock);
967 	}
968 	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
969 		/*
970 		 * We are passed the mark, update state
971 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
972 		 * The draft Posix socket spec states that the mark should
973 		 * not be cleared when peeking. We follow the latter.
974 		 */
975 		mutex_enter(&so->so_lock);
976 		ASSERT(so_verify_oobstate(so));
977 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
978 		freemsg(so->so_oobmsg);
979 		so->so_oobmsg = NULL;
980 		ASSERT(so_verify_oobstate(so));
981 		mutex_exit(&so->so_lock);
982 	}
983 	ASSERT(so->so_rcv_wakeup == B_FALSE);
984 done:
985 	if (sodp != NULL) {
986 		mutex_enter(sodp->sod_lockp);
987 		if ((sodp->sod_state & SOD_ENABLED) &&
988 		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
989 			SOD_UIOAFINI(sodp);
990 			if (sodp->sod_uioa.uioa_mbytes > 0) {
991 				ASSERT(so->so_rcv_q_head != NULL ||
992 				    so->so_rcv_head != NULL);
993 				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
994 				if (error == EWOULDBLOCK)
995 					error = 0;
996 			}
997 		}
998 		mutex_exit(sodp->sod_lockp);
999 	}
1000 #ifdef DEBUG
1001 	if (so_debug_length) {
1002 		mutex_enter(&so->so_lock);
1003 		ASSERT(so_check_length(so));
1004 		mutex_exit(&so->so_lock);
1005 	}
1006 #endif
1007 	rvalp->r_val1 = more;
1008 	return (error);
1009 }
1010 
1011 void
1012 so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
1013 {
1014 	ASSERT(MUTEX_HELD(&so->so_lock));
1015 
1016 #ifdef DEBUG
1017 	if (so_debug_length) {
1018 		ASSERT(so_check_length(so));
1019 	}
1020 #endif
1021 	so->so_rcv_queued += msg_size;
1022 
1023 	if (so->so_rcv_head == NULL) {
1024 		ASSERT(so->so_rcv_last_head == NULL);
1025 		so->so_rcv_head = mp;
1026 		so->so_rcv_last_head = mp;
1027 	} else if ((DB_TYPE(mp) == M_DATA &&
1028 	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
1029 	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
1030 	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
1031 		/* Added to the end */
1032 		ASSERT(so->so_rcv_last_head != NULL);
1033 		ASSERT(so->so_rcv_last_head->b_prev != NULL);
1034 		so->so_rcv_last_head->b_prev->b_cont = mp;
1035 	} else {
1036 		/* Start a new end */
1037 		so->so_rcv_last_head->b_next = mp;
1038 		so->so_rcv_last_head = mp;
1039 	}
1040 	while (mp->b_cont != NULL)
1041 		mp = mp->b_cont;
1042 
1043 	so->so_rcv_last_head->b_prev = mp;
1044 #ifdef DEBUG
1045 	if (so_debug_length) {
1046 		ASSERT(so_check_length(so));
1047 	}
1048 #endif
1049 }
1050 
1051 /*
1052  * Return B_TRUE if there is data in the message, B_FALSE otherwise.
1053  */
1054 boolean_t
1055 somsghasdata(mblk_t *mp)
1056 {
1057 	for (; mp; mp = mp->b_cont)
1058 		if (mp->b_datap->db_type == M_DATA) {
1059 			ASSERT(mp->b_wptr >= mp->b_rptr);
1060 			if (mp->b_wptr > mp->b_rptr)
1061 				return (B_TRUE);
1062 		}
1063 	return (B_FALSE);
1064 }
1065 
1066 /*
1067  * Flush the read side of sockfs.
1068  *
1069  * The caller must be sure that a reader is not already active when the
1070  * buffer is being flushed.
1071  */
1072 void
1073 so_rcv_flush(struct sonode *so)
1074 {
1075 	mblk_t  *mp;
1076 
1077 	ASSERT(MUTEX_HELD(&so->so_lock));
1078 
1079 	if (so->so_oobmsg != NULL) {
1080 		freemsg(so->so_oobmsg);
1081 		so->so_oobmsg = NULL;
1082 		so->so_oobmark = 0;
1083 		so->so_state &=
1084 		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
1085 	}
1086 
1087 	/*
1088 	 * Free messages sitting in the send and recv queue
1089 	 */
1090 	while (so->so_rcv_q_head != NULL) {
1091 		mp = so->so_rcv_q_head;
1092 		so->so_rcv_q_head = mp->b_next;
1093 		mp->b_next = mp->b_prev = NULL;
1094 		freemsg(mp);
1095 	}
1096 	while (so->so_rcv_head != NULL) {
1097 		mp = so->so_rcv_head;
1098 		so->so_rcv_head = mp->b_next;
1099 		mp->b_next = mp->b_prev = NULL;
1100 		freemsg(mp);
1101 	}
1102 	so->so_rcv_queued = 0;
1103 	so->so_rcv_q_head = NULL;
1104 	so->so_rcv_q_last_head = NULL;
1105 	so->so_rcv_head = NULL;
1106 	so->so_rcv_last_head = NULL;
1107 }
1108 
1109 /*
1110  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
1111  */
1112 int
1113 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
1114     boolean_t oob_inline)
1115 {
1116 	mblk_t		*mp, *nmp;
1117 	int		error;
1118 
1119 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
1120 	    flags));
1121 
1122 	if (msg != NULL) {
1123 		/*
1124 		 * There is never any oob data with addresses or control since
1125 		 * the T_EXDATA_IND does not carry any options.
1126 		 */
1127 		msg->msg_controllen = 0;
1128 		msg->msg_namelen = 0;
1129 		msg->msg_flags = 0;
1130 	}
1131 
1132 	mutex_enter(&so->so_lock);
1133 	ASSERT(so_verify_oobstate(so));
1134 	if (oob_inline ||
1135 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
1136 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
1137 		mutex_exit(&so->so_lock);
1138 		return (EINVAL);
1139 	}
1140 	if (!(so->so_state & SS_HAVEOOBDATA)) {
1141 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
1142 		mutex_exit(&so->so_lock);
1143 		return (EWOULDBLOCK);
1144 	}
1145 	ASSERT(so->so_oobmsg != NULL);
1146 	mp = so->so_oobmsg;
1147 	if (flags & MSG_PEEK) {
1148 		/*
1149 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
1150 		 * Instead we revert to the consolidation private
1151 		 * allocb_wait plus bcopy.
1152 		 */
1153 		mblk_t *mp1;
1154 
1155 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
1156 		ASSERT(mp1);
1157 
1158 		while (mp != NULL) {
1159 			ssize_t size;
1160 
1161 			size = MBLKL(mp);
1162 			bcopy(mp->b_rptr, mp1->b_wptr, size);
1163 			mp1->b_wptr += size;
1164 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
1165 			mp = mp->b_cont;
1166 		}
1167 		mp = mp1;
1168 	} else {
1169 		/*
1170 		 * Update the state indicating that the data has been consumed.
1171 		 * Keep SS_OOBPEND set until data is consumed past the mark.
1172 		 */
1173 		so->so_oobmsg = NULL;
1174 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
1175 	}
1176 	ASSERT(so_verify_oobstate(so));
1177 	mutex_exit(&so->so_lock);
1178 
1179 	error = 0;
1180 	nmp = mp;
1181 	while (nmp != NULL && uiop->uio_resid > 0) {
1182 		ssize_t n = MBLKL(nmp);
1183 
1184 		n = MIN(n, uiop->uio_resid);
1185 		if (n > 0)
1186 			error = uiomove(nmp->b_rptr, n,
1187 			    UIO_READ, uiop);
1188 		if (error)
1189 			break;
1190 		nmp = nmp->b_cont;
1191 	}
1192 	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
1193 	freemsg(mp);
1194 	return (error);
1195 }
1196 
1197 /*
1198  * Allocate and initializ sonode
1199  */
1200 /* ARGSUSED */
1201 struct sonode *
1202 socket_sonode_create(struct sockparams *sp, int family, int type,
1203     int protocol, int version, int sflags, int *errorp, struct cred *cr)
1204 {
1205 	sonode_t *so;
1206 	int	kmflags;
1207 
1208 	/*
1209 	 * Choose the right set of sonodeops based on the upcall and
1210 	 * down call version that the protocol has provided
1211 	 */
1212 	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
1213 	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
1214 		/*
1215 		 * mismatch
1216 		 */
1217 #ifdef DEBUG
1218 		cmn_err(CE_CONT, "protocol and socket module version mismatch");
1219 #endif
1220 		*errorp = EINVAL;
1221 		return (NULL);
1222 	}
1223 
1224 	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1225 
1226 	so = kmem_cache_alloc(socket_cache, kmflags);
1227 	if (so == NULL) {
1228 		*errorp = ENOMEM;
1229 		return (NULL);
1230 	}
1231 
1232 	sonode_init(so, sp, family, type, protocol, &so_sonodeops);
1233 
1234 	if (version == SOV_DEFAULT)
1235 		version = so_default_version;
1236 
1237 	so->so_version = (short)version;
1238 
1239 	/*
1240 	 * set the default values to be INFPSZ
1241 	 * if a protocol desires it can change the value later
1242 	 */
1243 	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
1244 	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
1245 	so->so_proto_props.sopp_maxpsz = INFPSZ;
1246 	so->so_proto_props.sopp_maxblk = INFPSZ;
1247 
1248 	return (so);
1249 }
1250 
1251 int
1252 socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
1253 {
1254 	int error = 0;
1255 
1256 	if (pso != NULL) {
1257 		/*
1258 		 * We have a passive open, so inherit basic state from
1259 		 * the parent (listener).
1260 		 *
1261 		 * No need to grab the new sonode's lock, since there is no
1262 		 * one that can have a reference to it.
1263 		 */
1264 		mutex_enter(&pso->so_lock);
1265 
1266 		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
1267 		so->so_pgrp = pso->so_pgrp;
1268 		so->so_rcvtimeo = pso->so_rcvtimeo;
1269 		so->so_sndtimeo = pso->so_sndtimeo;
1270 		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
1271 		/*
1272 		 * Make note of the socket level options. TCP and IP level
1273 		 * options are already inherited. We could do all this after
1274 		 * accept is successful but doing it here simplifies code and
1275 		 * no harm done for error case.
1276 		 */
1277 		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
1278 		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1279 		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1280 		so->so_proto_props = pso->so_proto_props;
1281 		so->so_mode = pso->so_mode;
1282 		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
1283 
1284 		mutex_exit(&pso->so_lock);
1285 
1286 		if (uioasync.enabled) {
1287 			sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
1288 		}
1289 		return (0);
1290 	} else {
1291 		struct sockparams *sp = so->so_sockparams;
1292 		sock_upcalls_t *upcalls_to_use;
1293 
1294 		/*
1295 		 * Based on the version number select the right upcalls to
1296 		 * pass down. Currently we only have one version so choose
1297 		 * default
1298 		 */
1299 		upcalls_to_use = &so_upcalls;
1300 
1301 		/* active open, so create a lower handle */
1302 		so->so_proto_handle =
1303 		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
1304 		    so->so_type, so->so_protocol, &so->so_downcalls,
1305 		    &so->so_mode, &error, flags, cr);
1306 
1307 		if (so->so_proto_handle == NULL) {
1308 			ASSERT(error != 0);
1309 			/*
1310 			 * To be safe; if a lower handle cannot be created, and
1311 			 * the proto does not give a reason why, assume there
1312 			 * was a lack of memory.
1313 			 */
1314 			return ((error == 0) ? ENOMEM : error);
1315 		}
1316 		ASSERT(so->so_downcalls != NULL);
1317 		ASSERT(so->so_downcalls->sd_send != NULL ||
1318 		    so->so_downcalls->sd_send_uio != NULL);
1319 		if (so->so_downcalls->sd_recv_uio != NULL) {
1320 			ASSERT(so->so_downcalls->sd_poll != NULL);
1321 			so->so_pollev |= SO_POLLEV_ALWAYS;
1322 		}
1323 
1324 		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
1325 		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
1326 
1327 		/* Wildcard */
1328 
1329 		/*
1330 		 * FIXME No need for this, the protocol can deal with it in
1331 		 * sd_create(). Should update ICMP.
1332 		 */
1333 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
1334 			int protocol = so->so_protocol;
1335 			int error;
1336 			/*
1337 			 * Issue SO_PROTOTYPE setsockopt.
1338 			 */
1339 			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
1340 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
1341 			if (error) {
1342 				(void) (*so->so_downcalls->sd_close)
1343 				    (so->so_proto_handle, 0, cr);
1344 
1345 				mutex_enter(&so->so_lock);
1346 				so_rcv_flush(so);
1347 				mutex_exit(&so->so_lock);
1348 				/*
1349 				 * Setsockopt often fails with ENOPROTOOPT but
1350 				 * socket() should fail with
1351 				 * EPROTONOSUPPORT/EPROTOTYPE.
1352 				 */
1353 				return (EPROTONOSUPPORT);
1354 			}
1355 		}
1356 		return (0);
1357 	}
1358 }
1359 
1360 /*
1361  * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1362  *         struct cred *cr, int32_t *rvalp)
1363  *
1364  * Handle ioctls that manipulate basic socket state; non-blocking,
1365  * async, etc.
1366  *
1367  * Returns:
1368  *   < 0  - ioctl was not handle
1369  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1370  *
1371  * Notes:
1372  *   Assumes the standard receive buffer is used to obtain info for
1373  *   NREAD.
1374  */
1375 /* ARGSUSED */
1376 int
1377 socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1378     struct cred *cr, int32_t *rvalp)
1379 {
1380 	switch (cmd) {
1381 	case SIOCSQPTR:
1382 		/*
1383 		 * SIOCSQPTR is valid only when helper stream is created
1384 		 * by the protocol.
1385 		 */
1386 
1387 		return (EOPNOTSUPP);
1388 	case FIONBIO: {
1389 		int32_t value;
1390 
1391 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
1392 		    (mode & (int)FKIOCTL)))
1393 			return (EFAULT);
1394 
1395 		mutex_enter(&so->so_lock);
1396 		if (value) {
1397 			so->so_state |= SS_NDELAY;
1398 		} else {
1399 			so->so_state &= ~SS_NDELAY;
1400 		}
1401 		mutex_exit(&so->so_lock);
1402 		return (0);
1403 	}
1404 	case FIOASYNC: {
1405 		int32_t value;
1406 
1407 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
1408 		    (mode & (int)FKIOCTL)))
1409 			return (EFAULT);
1410 
1411 		mutex_enter(&so->so_lock);
1412 
1413 		if (value) {
1414 			/* Turn on SIGIO */
1415 			so->so_state |= SS_ASYNC;
1416 		} else {
1417 			/* Turn off SIGIO */
1418 			so->so_state &= ~SS_ASYNC;
1419 		}
1420 		mutex_exit(&so->so_lock);
1421 
1422 		return (0);
1423 	}
1424 
1425 	case SIOCSPGRP:
1426 	case FIOSETOWN: {
1427 		int error;
1428 		pid_t pid;
1429 
1430 		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
1431 		    (mode & (int)FKIOCTL)))
1432 			return (EFAULT);
1433 
1434 		mutex_enter(&so->so_lock);
1435 		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
1436 		mutex_exit(&so->so_lock);
1437 		return (error);
1438 	}
1439 	case SIOCGPGRP:
1440 	case FIOGETOWN:
1441 		if (so_copyout(&so->so_pgrp, (void *)arg,
1442 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
1443 			return (EFAULT);
1444 
1445 		return (0);
1446 	case SIOCATMARK: {
1447 		int retval;
1448 
1449 		/*
1450 		 * Only protocols that support urgent data can handle ATMARK.
1451 		 */
1452 		if ((so->so_mode & SM_EXDATA) == 0)
1453 			return (EINVAL);
1454 
1455 		/*
1456 		 * If the protocol is maintaining its own buffer, then the
1457 		 * request must be passed down.
1458 		 */
1459 		if (so->so_downcalls->sd_recv_uio != NULL)
1460 			return (-1);
1461 
1462 		retval = (so->so_state & SS_RCVATMARK) != 0;
1463 
1464 		if (so_copyout(&retval, (void *)arg, sizeof (int),
1465 		    (mode & (int)FKIOCTL))) {
1466 			return (EFAULT);
1467 		}
1468 		return (0);
1469 	}
1470 
1471 	case FIONREAD: {
1472 		int retval;
1473 
1474 		/*
1475 		 * If the protocol is maintaining its own buffer, then the
1476 		 * request must be passed down.
1477 		 */
1478 		if (so->so_downcalls->sd_recv_uio != NULL)
1479 			return (-1);
1480 
1481 		retval = MIN(so->so_rcv_queued, INT_MAX);
1482 
1483 		if (so_copyout(&retval, (void *)arg,
1484 		    sizeof (retval), (mode & (int)FKIOCTL))) {
1485 			return (EFAULT);
1486 		}
1487 		return (0);
1488 	}
1489 
1490 	case _I_GETPEERCRED: {
1491 		int error = 0;
1492 
1493 		if ((mode & FKIOCTL) == 0)
1494 			return (EINVAL);
1495 
1496 		mutex_enter(&so->so_lock);
1497 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
1498 			error = ENOTSUP;
1499 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
1500 			error = ENOTCONN;
1501 		} else if (so->so_peercred != NULL) {
1502 			k_peercred_t *kp = (k_peercred_t *)arg;
1503 			kp->pc_cr = so->so_peercred;
1504 			kp->pc_cpid = so->so_cpid;
1505 			crhold(so->so_peercred);
1506 		} else {
1507 			error = EINVAL;
1508 		}
1509 		mutex_exit(&so->so_lock);
1510 		return (error);
1511 	}
1512 	default:
1513 		return (-1);
1514 	}
1515 }
1516 
1517 /*
1518  * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified
1519  * then the socket will fall back to TPI.
1520  *
1521  * Returns:
1522  *   < 0  - ioctl was not handle
1523  *  >= 0  - ioctl was handled, if > 0, then it is an errno
1524  */
1525 int
1526 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
1527     struct cred *cr, int32_t *rvalp)
1528 {
1529 	switch (cmd) {
1530 	case _I_INSERT:
1531 	case _I_REMOVE:
1532 	case I_FIND:
1533 	case I_LIST:
1534 		return (EOPNOTSUPP);
1535 
1536 	case I_PUSH:
1537 	case I_POP: {
1538 		int retval;
1539 
1540 		if ((retval = so_tpi_fallback(so, cr)) == 0) {
1541 			/* Reissue the ioctl */
1542 			ASSERT(so->so_rcv_q_head == NULL);
1543 			return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
1544 		}
1545 		return (retval);
1546 	}
1547 	case I_LOOK:
1548 		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
1549 		    (mode & (int)FKIOCTL))) {
1550 			return (EFAULT);
1551 		}
1552 		return (0);
1553 	default:
1554 		return (-1);
1555 	}
1556 }
1557 
1558 int
1559 socket_getopt_common(struct sonode *so, int level, int option_name,
1560     void *optval, socklen_t *optlenp, int flags)
1561 {
1562 	if (level != SOL_SOCKET)
1563 		return (-1);
1564 
1565 	switch (option_name) {
1566 	case SO_ERROR:
1567 	case SO_DOMAIN:
1568 	case SO_TYPE:
1569 	case SO_ACCEPTCONN: {
1570 		int32_t value;
1571 		socklen_t optlen = *optlenp;
1572 
1573 		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
1574 			return (EINVAL);
1575 		}
1576 
1577 		switch (option_name) {
1578 		case SO_ERROR:
1579 			mutex_enter(&so->so_lock);
1580 			value = sogeterr(so, B_TRUE);
1581 			mutex_exit(&so->so_lock);
1582 			break;
1583 		case SO_DOMAIN:
1584 			value = so->so_family;
1585 			break;
1586 		case SO_TYPE:
1587 			value = so->so_type;
1588 			break;
1589 		case SO_ACCEPTCONN:
1590 			if (so->so_state & SS_ACCEPTCONN)
1591 				value = SO_ACCEPTCONN;
1592 			else
1593 				value = 0;
1594 			break;
1595 		}
1596 
1597 		bcopy(&value, optval, sizeof (value));
1598 		*optlenp = sizeof (value);
1599 
1600 		return (0);
1601 	}
1602 	case SO_SNDTIMEO:
1603 	case SO_RCVTIMEO: {
1604 		clock_t value;
1605 		socklen_t optlen = *optlenp;
1606 
1607 		if (get_udatamodel() == DATAMODEL_NONE ||
1608 		    get_udatamodel() == DATAMODEL_NATIVE) {
1609 			if (optlen < sizeof (struct timeval))
1610 				return (EINVAL);
1611 		} else {
1612 			if (optlen < sizeof (struct timeval32))
1613 				return (EINVAL);
1614 		}
1615 		if (option_name == SO_RCVTIMEO)
1616 			value = drv_hztousec(so->so_rcvtimeo);
1617 		else
1618 			value = drv_hztousec(so->so_sndtimeo);
1619 
1620 		if (get_udatamodel() == DATAMODEL_NONE ||
1621 		    get_udatamodel() == DATAMODEL_NATIVE) {
1622 			((struct timeval *)(optval))->tv_sec =
1623 			    value / (1000 * 1000);
1624 			((struct timeval *)(optval))->tv_usec =
1625 			    value % (1000 * 1000);
1626 			*optlenp = sizeof (struct timeval);
1627 		} else {
1628 			((struct timeval32 *)(optval))->tv_sec =
1629 			    value / (1000 * 1000);
1630 			((struct timeval32 *)(optval))->tv_usec =
1631 			    value % (1000 * 1000);
1632 			*optlenp = sizeof (struct timeval32);
1633 		}
1634 		return (0);
1635 	}
1636 	case SO_DEBUG:
1637 	case SO_REUSEADDR:
1638 	case SO_KEEPALIVE:
1639 	case SO_DONTROUTE:
1640 	case SO_BROADCAST:
1641 	case SO_USELOOPBACK:
1642 	case SO_OOBINLINE:
1643 	case SO_SNDBUF:
1644 #ifdef notyet
1645 	case SO_SNDLOWAT:
1646 	case SO_RCVLOWAT:
1647 #endif /* notyet */
1648 	case SO_DGRAM_ERRIND: {
1649 		socklen_t optlen = *optlenp;
1650 
1651 		if (optlen < (t_uscalar_t)sizeof (int32_t))
1652 			return (EINVAL);
1653 		break;
1654 	}
1655 	case SO_RCVBUF: {
1656 		socklen_t optlen = *optlenp;
1657 
1658 		if (optlen < (t_uscalar_t)sizeof (int32_t))
1659 			return (EINVAL);
1660 
1661 		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
1662 			/*
1663 			 * XXX If SO_RCVBUF has been set and this is an
1664 			 * XPG 4.2 application then do not ask the transport
1665 			 * since the transport might adjust the value and not
1666 			 * return exactly what was set by the application.
1667 			 * For non-XPG 4.2 application we return the value
1668 			 * that the transport is actually using.
1669 			 */
1670 			*(int32_t *)optval = so->so_xpg_rcvbuf;
1671 			*optlenp = sizeof (so->so_xpg_rcvbuf);
1672 			return (0);
1673 		}
1674 		/*
1675 		 * If the option has not been set then get a default
1676 		 * value from the transport.
1677 		 */
1678 		break;
1679 	}
1680 	case SO_LINGER: {
1681 		socklen_t optlen = *optlenp;
1682 
1683 		if (optlen < (t_uscalar_t)sizeof (struct linger))
1684 			return (EINVAL);
1685 		break;
1686 	}
1687 	case SO_SND_BUFINFO: {
1688 		socklen_t optlen = *optlenp;
1689 
1690 		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
1691 			return (EINVAL);
1692 		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
1693 		    (so->so_proto_props).sopp_wroff;
1694 		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
1695 		    (so->so_proto_props).sopp_maxblk;
1696 		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
1697 		    (so->so_proto_props).sopp_maxpsz;
1698 		((struct so_snd_bufinfo *)(optval))->sbi_tail =
1699 		    (so->so_proto_props).sopp_tail;
1700 		*optlenp = sizeof (struct so_snd_bufinfo);
1701 		return (0);
1702 	}
1703 	default:
1704 		break;
1705 	}
1706 
1707 	/* Unknown Option */
1708 	return (-1);
1709 }
1710 
1711 void
1712 socket_sonode_destroy(struct sonode *so)
1713 {
1714 	sonode_fini(so);
1715 	kmem_cache_free(socket_cache, so);
1716 }
1717 
1718 int
1719 so_zcopy_wait(struct sonode *so)
1720 {
1721 	int error = 0;
1722 
1723 	mutex_enter(&so->so_lock);
1724 	while (!(so->so_copyflag & STZCNOTIFY)) {
1725 		if (so->so_state & SS_CLOSING) {
1726 			mutex_exit(&so->so_lock);
1727 			return (EINTR);
1728 		}
1729 		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
1730 			error = EINTR;
1731 			break;
1732 		}
1733 	}
1734 	so->so_copyflag &= ~STZCNOTIFY;
1735 	mutex_exit(&so->so_lock);
1736 	return (error);
1737 }
1738 
1739 void
1740 so_timer_callback(void *arg)
1741 {
1742 	struct sonode *so = (struct sonode *)arg;
1743 
1744 	mutex_enter(&so->so_lock);
1745 
1746 	so->so_rcv_timer_tid = 0;
1747 	if (so->so_rcv_queued > 0) {
1748 		so_notify_data(so, so->so_rcv_queued);
1749 	} else {
1750 		mutex_exit(&so->so_lock);
1751 	}
1752 }
1753 
#ifdef DEBUG
/*
 * Consistency check: add up the data bytes of every message on both
 * receive lists (so_rcv_q_head and so_rcv_head) and compare the total
 * with so_rcv_queued. The caller must hold so_lock.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (mp = so->so_rcv_q_head; mp != NULL; mp = mp->b_next)
		len += msgdsize(mp);

	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
		len += msgdsize(mp);

	if (len == so->so_rcv_queued)
		return (B_TRUE);
	return (B_FALSE);
}
#endif
1781 
1782 int
1783 so_get_mod_version(struct sockparams *sp)
1784 {
1785 	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
1786 	return (sp->sp_smod_info->smod_version);
1787 }
1788 
1789 /*
1790  * so_start_fallback()
1791  *
1792  * Block new socket operations from coming in, and wait for active operations
1793  * to complete. Threads that are sleeping will be woken up so they can get
1794  * out of the way.
1795  *
1796  * The caller must be a reader on so_fallback_rwlock.
1797  */
1798 static boolean_t
1799 so_start_fallback(struct sonode *so)
1800 {
1801 	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1802 
1803 	mutex_enter(&so->so_lock);
1804 	if (so->so_state & SS_FALLBACK_PENDING) {
1805 		mutex_exit(&so->so_lock);
1806 		return (B_FALSE);
1807 	}
1808 	so->so_state |= SS_FALLBACK_PENDING;
1809 	/*
1810 	 * Poke all threads that might be sleeping. Any operation that comes
1811 	 * in after the cv_broadcast will observe the fallback pending flag
1812 	 * which cause the call to return where it would normally sleep.
1813 	 */
1814 	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
1815 	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
1816 	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
1817 	mutex_enter(&so->so_acceptq_lock);
1818 	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
1819 	mutex_exit(&so->so_acceptq_lock);
1820 	mutex_exit(&so->so_lock);
1821 
1822 	/*
1823 	 * The main reason for the rw_tryupgrade call is to provide
1824 	 * observability during the fallback process. We want to
1825 	 * be able to see if there are pending operations.
1826 	 */
1827 	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1828 		/*
1829 		 * It is safe to drop and reaquire the fallback lock, because
1830 		 * we are guaranteed that another fallback cannot take place.
1831 		 */
1832 		rw_exit(&so->so_fallback_rwlock);
1833 		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1834 		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1835 		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1836 	}
1837 
1838 	return (B_TRUE);
1839 }
1840 
1841 /*
1842  * so_end_fallback()
1843  *
1844  * Allow socket opertions back in.
1845  *
1846  * The caller must be a writer on so_fallback_rwlock.
1847  */
1848 static void
1849 so_end_fallback(struct sonode *so)
1850 {
1851 	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
1852 
1853 	mutex_enter(&so->so_lock);
1854 	so->so_state &= ~SS_FALLBACK_PENDING;
1855 	mutex_exit(&so->so_lock);
1856 
1857 	rw_downgrade(&so->so_fallback_rwlock);
1858 }
1859 
1860 /*
1861  * so_quiesced_cb()
1862  *
1863  * Callback passed to the protocol during fallback. It is called once
1864  * the endpoint is quiescent.
1865  *
1866  * No requests from the user, no notifications from the protocol, so it
1867  * is safe to synchronize the state. Data can also be moved without
1868  * risk for reordering.
1869  *
1870  * NOTE: urgent data is dropped on the floor.
1871  *
1872  * We do not need to hold so_lock, since there can be only one thread
1873  * operating on the sonode.
1874  */
1875 static void
1876 so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
1877     struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
1878     struct sockaddr *faddr, socklen_t faddrlen, short opts)
1879 {
1880 	struct sonode *so = (struct sonode *)sock_handle;
1881 
1882 	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
1883 
1884 	mutex_enter(&so->so_lock);
1885 	SOCKET_TIMER_CANCEL(so);
1886 	mutex_exit(&so->so_lock);
1887 	/*
1888 	 * Move data to the STREAM head.
1889 	 */
1890 	if (so->so_rcv_head != NULL) {
1891 		if (so->so_rcv_q_last_head == NULL)
1892 			so->so_rcv_q_head = so->so_rcv_head;
1893 		else
1894 			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
1895 		so->so_rcv_q_last_head = so->so_rcv_last_head;
1896 	}
1897 
1898 	while (so->so_rcv_q_head != NULL) {
1899 		mblk_t *mp = so->so_rcv_q_head;
1900 		size_t mlen = msgdsize(mp);
1901 
1902 		so->so_rcv_q_head = mp->b_next;
1903 		mp->b_next = NULL;
1904 		mp->b_prev = NULL;
1905 		so->so_rcv_queued -= mlen;
1906 		putnext(q, mp);
1907 	}
1908 	ASSERT(so->so_rcv_queued == 0);
1909 	so->so_rcv_head = NULL;
1910 	so->so_rcv_last_head = NULL;
1911 	so->so_rcv_q_head = NULL;
1912 	so->so_rcv_q_last_head = NULL;
1913 
1914 #ifdef DEBUG
1915 	if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
1916 		cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
1917 	}
1918 #endif
1919 	if (so->so_oobmsg != NULL) {
1920 		freemsg(so->so_oobmsg);
1921 		so->so_oobmsg = NULL;
1922 	}
1923 	so->so_oobmark = 0;
1924 
1925 	ASSERT(so->so_rcv_queued == 0);
1926 }
1927 
1928 /*
1929  * so_tpi_fallback()
1930  *
1931  * This is fallback initation routine; things start here.
1932  *
1933  * Basic strategy:
1934  *   o Block new socket operations from coming in
1935  *   o Allocate/initate info needed by TPI
1936  *   o Quiesce the connection, at which point we sync
1937  *     state and move data
1938  *   o Change operations (sonodeops) associated with the socket
1939  *   o Unblock threads waiting for the fallback to finish
1940  */
1941 int
1942 so_tpi_fallback(struct sonode *so, struct cred *cr)
1943 {
1944 	int error;
1945 	queue_t *q;
1946 	struct sockparams *sp;
1947 	struct sockparams *newsp;
1948 	so_proto_fallback_func_t fbfunc;
1949 	boolean_t direct;
1950 
1951 	error = 0;
1952 	sp = so->so_sockparams;
1953 	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
1954 
1955 	/*
1956 	 * Fallback can only happen if there is a device associated
1957 	 * with the sonode, and the socket module has a fallback function.
1958 	 */
1959 	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
1960 		return (EINVAL);
1961 
1962 	/*
1963 	 * Initiate fallback; upon success we know that no new requests
1964 	 * will come in from the user.
1965 	 */
1966 	if (!so_start_fallback(so))
1967 		return (EAGAIN);
1968 
1969 	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
1970 	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
1971 	    KM_SLEEP, &error);
1972 	if (error != 0)
1973 		goto out;
1974 
1975 	if (so->so_direct != NULL) {
1976 		sodirect_t *sodp = so->so_direct;
1977 		mutex_enter(sodp->sod_lockp);
1978 
1979 		so->so_direct->sod_state &= ~SOD_ENABLED;
1980 		so->so_state &= ~SS_SODIRECT;
1981 		ASSERT(sodp->sod_uioafh == NULL);
1982 		mutex_exit(sodp->sod_lockp);
1983 	}
1984 
1985 	/* Turn sonode into a TPI socket */
1986 	q = sotpi_convert_sonode(so, newsp, &direct, cr);
1987 	if (q == NULL) {
1988 		zcmn_err(getzoneid(), CE_WARN,
1989 		    "Failed to convert socket to TPI. Pid = %d\n",
1990 		    curproc->p_pid);
1991 		SOCKPARAMS_DEC_REF(newsp);
1992 		error = EINVAL;
1993 		goto out;
1994 	}
1995 
1996 	/*
1997 	 * Now tell the protocol to start using TPI. so_quiesced_cb be
1998 	 * called once it's safe to synchronize state.
1999 	 */
2000 	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
2001 	/* FIXME assumes this cannot fail. TCP can fail to enter squeue */
2002 	(*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
2003 	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
2004 
2005 	/*
2006 	 * Free all pending connection indications, i.e., socket_accept() has
2007 	 * not yet pulled the connection of the queue. The transport sent
2008 	 * a T_CONN_IND message for each pending connection to the STREAM head.
2009 	 */
2010 	so_acceptq_flush(so);
2011 
2012 	mutex_enter(&so->so_lock);
2013 	so->so_state |= SS_FALLBACK_COMP;
2014 	mutex_exit(&so->so_lock);
2015 
2016 	/*
2017 	 * Swap the sonode ops. Socket opertations that come in once this
2018 	 * is done will proceed without blocking.
2019 	 */
2020 	so->so_ops = &sotpi_sonodeops;
2021 
2022 	/*
2023 	 * No longer a non streams socket
2024 	 */
2025 	so->so_not_str = B_FALSE;
2026 	/*
2027 	 * Wake up any threads stuck in poll. This is needed since the poll
2028 	 * head changes when the fallback happens (moves from the sonode to
2029 	 * the STREAMS head).
2030 	 */
2031 	pollwakeup(&so->so_poll_list, POLLERR);
2032 out:
2033 	so_end_fallback(so);
2034 
2035 	return (error);
2036 }
2037